diff options
Diffstat (limited to 'xlators/features')
277 files changed, 84785 insertions, 34716 deletions
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index d2f5ef19290..c57897f11ea 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,4 +1,14 @@ -SUBDIRS = locks quota read-only mac-compat quiesce marker index \ - protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block # trash path-converter # filter +if BUILD_CLOUDSYNC + CLOUDSYNC_DIR = cloudsync +endif + +if BUILD_METADISP + METADISP_DIR = metadisp +endif + +SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \ + compress changelog gfid-access snapview-client snapview-server trash \ + shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \ + utime $(METADISP_DIR) CLEANFILES = diff --git a/xlators/features/glupy/Makefile.am b/xlators/features/arbiter/Makefile.am index a985f42a877..a985f42a877 100644 --- a/xlators/features/glupy/Makefile.am +++ b/xlators/features/arbiter/Makefile.am diff --git a/xlators/features/arbiter/src/Makefile.am b/xlators/features/arbiter/src/Makefile.am new file mode 100644 index 00000000000..badc42f37be --- /dev/null +++ b/xlators/features/arbiter/src/Makefile.am @@ -0,0 +1,19 @@ +if WITH_SERVER +xlator_LTLIBRARIES = arbiter.la +endif + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +arbiter_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +arbiter_la_SOURCES = arbiter.c +arbiter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = arbiter.h arbiter-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/arbiter/src/arbiter-mem-types.h b/xlators/features/arbiter/src/arbiter-mem-types.h new file mode 100644 index 00000000000..05d18374c46 --- /dev/null +++ b/xlators/features/arbiter/src/arbiter-mem-types.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __ARBITER_MEM_TYPES_H__ +#define __ARBITER_MEM_TYPES_H__ +#include <glusterfs/mem-types.h> + +typedef enum gf_arbiter_mem_types_ { + gf_arbiter_mt_inode_ctx_t = gf_common_mt_end + 1, + gf_arbiter_mt_end +} gf_arbiter_mem_types_t; +#endif diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c new file mode 100644 index 00000000000..83a97e3354b --- /dev/null +++ b/xlators/features/arbiter/src/arbiter.c @@ -0,0 +1,380 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "arbiter.h" +#include "arbiter-mem-types.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> + +static arbiter_inode_ctx_t * +__arbiter_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + arbiter_inode_ctx_t *ctx = NULL; + int ret = 0; + uint64_t ctx_addr = 0; + + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret == 0) { + ctx = (arbiter_inode_ctx_t *)(long)ctx_addr; + goto out; + } + + ctx = GF_CALLOC(1, sizeof(*ctx), gf_arbiter_mt_inode_ctx_t); + if (!ctx) + goto out; + + ret = __inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ctx); + if (ret) { + GF_FREE(ctx); + ctx = NULL; + gf_log_callingfn(this->name, GF_LOG_ERROR, + "failed to " + "set the inode ctx (%s)", + uuid_utoa(inode->gfid)); + } +out: + return ctx; +} + +static arbiter_inode_ctx_t * +arbiter_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + arbiter_inode_ctx_t *ctx = NULL; + + LOCK(&inode->lock); + { + ctx = __arbiter_inode_ctx_get(inode, this); + } + UNLOCK(&inode->lock); + return ctx; +} + +int32_t +arbiter_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + arbiter_inode_ctx_t *ctx = NULL; + + if (op_ret != 0) + goto unwind; + ctx = arbiter_inode_ctx_get(inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + memcpy(&ctx->iattbuf, buf, sizeof(ctx->iattbuf)); + +unwind: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +int32_t +arbiter_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND(frame, arbiter_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} + +int32_t +arbiter_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + int32_t op_ret = 0; 
+ int32_t op_errno = 0; + + ctx = arbiter_inode_ctx_get(loc->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + buf = &ctx->iattbuf; +unwind: + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, buf, buf, NULL); + return 0; +} + +int32_t +arbiter_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) + +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + ctx = arbiter_inode_ctx_get(fd->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + buf = &ctx->iattbuf; +unwind: + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, buf, buf, NULL); + return 0; +} + +dict_t * +arbiter_fill_writev_xdata(fd_t *fd, dict_t *xdata, xlator_t *this) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + int is_append = 1; + + if (!fd || !fd->inode || gf_uuid_is_null(fd->inode->gfid)) { + goto out; + } + + if (!xdata) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + if (dict_get(xdata, GLUSTERFS_OPEN_FD_COUNT)) { + ret = dict_set_uint32(rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to set dict value" + " for GLUSTERFS_OPEN_FD_COUNT"); + } + } + if (dict_get(xdata, GLUSTERFS_WRITE_IS_APPEND)) { + ret = dict_set_uint32(rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, is_append); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to set dict value" + " for GLUSTERFS_WRITE_IS_APPEND"); + } + } +out: + return rsp_xdata; +} + +int32_t +arbiter_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + dict_t *rsp_xdata = NULL; + int op_ret = 0; + int op_errno = 0; + + ctx = arbiter_inode_ctx_get(fd->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto 
unwind; + } + buf = &ctx->iattbuf; + op_ret = iov_length(vector, count); + rsp_xdata = arbiter_fill_writev_xdata(fd, xdata, this); +unwind: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, buf, buf, rsp_xdata); + if (rsp_xdata) + dict_unref(rsp_xdata); + return 0; +} + +int32_t +arbiter_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + int op_ret = 0; + int op_errno = 0; + + ctx = arbiter_inode_ctx_get(fd->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + buf = &ctx->iattbuf; +unwind: + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, NULL); + return 0; +} + +int32_t +arbiter_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + int op_ret = 0; + int op_errno = 0; + + ctx = arbiter_inode_ctx_get(fd->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + buf = &ctx->iattbuf; +unwind: + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, NULL); + return 0; +} + +int32_t +arbiter_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + arbiter_inode_ctx_t *ctx = NULL; + struct iatt *buf = NULL; + int op_ret = 0; + int op_errno = 0; + + ctx = arbiter_inode_ctx_get(fd->inode, this); + if (!ctx) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + buf = &ctx->iattbuf; +unwind: + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, NULL); + return 0; +} + +static int32_t +arbiter_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + STACK_UNWIND_STRICT(readv, frame, -1, ENOSYS, NULL, 0, NULL, NULL, NULL); + return 0; +} + +static int32_t +arbiter_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t 
offset, + gf_seek_what_t what, dict_t *xdata) +{ + STACK_UNWIND_STRICT(seek, frame, -1, ENOSYS, 0, xdata); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_arbiter_mt_end + 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting " + "initialization failed."); + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + return 0; +} + +int +arbiter_forget(xlator_t *this, inode_t *inode) +{ + arbiter_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + + inode_ctx_del(inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + ctx = (arbiter_inode_ctx_t *)(long)ctx_addr; + GF_FREE(ctx); + return 0; +} + +int32_t +init(xlator_t *this) +{ + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "'arbiter' not configured with exactly one child"); + return -1; + } + + if (!this->parents) + gf_log(this->name, GF_LOG_ERROR, "dangling volume. check volfile "); + + return 0; +} + +void +fini(xlator_t *this) +{ + return; +} + +struct xlator_fops fops = { + .lookup = arbiter_lookup, + + /* Return success for these inode write FOPS without winding it down to + * posix; this is needed for AFR write transaction logic to work.*/ + .truncate = arbiter_truncate, + .writev = arbiter_writev, + .ftruncate = arbiter_ftruncate, + .fallocate = arbiter_fallocate, + .discard = arbiter_discard, + .zerofill = arbiter_zerofill, + + /* AFR is not expected to wind these inode read FOPS initiated by the + * application to the arbiter brick. But in case a bug causes them + * to be called, we return ENOSYS. 
*/ + .readv = arbiter_readv, + .seek = arbiter_seek, + + /* The following inode read FOPS initiated by the application are not + * wound by AFR either but internal logic like shd, glfsheal and + * client side healing in AFR will send them for selfheal/ inode refresh + * operations etc.,so we need to wind them down to posix: + * + * (f)stat, readdir(p), readlink, (f)getxattr.*/ + + /* All other FOPs not listed here are safe to be wound down to posix.*/ +}; + +struct xlator_cbks cbks = { + .forget = arbiter_forget, +}; + +struct volume_options options[] = { + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "arbiter", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/arbiter/src/arbiter.h b/xlators/features/arbiter/src/arbiter.h new file mode 100644 index 00000000000..546db7b751a --- /dev/null +++ b/xlators/features/arbiter/src/arbiter.h @@ -0,0 +1,21 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _ARBITER_H +#define _ARBITER_H + +#include <glusterfs/locking.h> +#include <glusterfs/common-utils.h> + +typedef struct arbiter_inode_ctx_ { + struct iatt iattbuf; +} arbiter_inode_ctx_t; + +#endif /* _ARBITER_H */ diff --git a/xlators/features/filter/Makefile.am b/xlators/features/barrier/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/features/filter/Makefile.am +++ b/xlators/features/barrier/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/features/barrier/src/Makefile.am b/xlators/features/barrier/src/Makefile.am new file mode 100644 index 00000000000..25099bc56e5 --- /dev/null +++ b/xlators/features/barrier/src/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = barrier.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +barrier_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +barrier_la_SOURCES = barrier.c + +barrier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = barrier.h barrier-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/filter/src/filter-mem-types.h b/xlators/features/barrier/src/barrier-mem-types.h index 47a17249b8d..71ed7898d9c 100644 --- a/xlators/features/filter/src/filter-mem-types.h +++ b/xlators/features/barrier/src/barrier-mem-types.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -7,14 +7,14 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef __FILTER_MEM_TYPES_H__ -#define __FILTER_MEM_TYPES_H__ -#include "mem-types.h" +#ifndef __BARRIER_MEM_TYPES_H__ +#define __BARRIER_MEM_TYPES_H__ -enum gf_filter_mem_types_ { - gf_filter_mt_gf_filter = gf_common_mt_end + 1, - gf_filter_mt_end +#include <glusterfs/mem-types.h> + +enum gf_barrier_mem_types_ { + gf_barrier_mt_priv_t = gf_common_mt_end + 1, + gf_barrier_mt_end }; #endif - diff --git a/xlators/features/barrier/src/barrier.c b/xlators/features/barrier/src/barrier.c new file mode 100644 index 00000000000..852bbacb99d --- /dev/null +++ b/xlators/features/barrier/src/barrier.c @@ -0,0 +1,809 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "barrier.h" +#include <glusterfs/defaults.h> +#include <glusterfs/call-stub.h> + +#include <glusterfs/statedump.h> + +void +barrier_local_set_gfid(call_frame_t *frame, uuid_t gfid, xlator_t *this) +{ + if (gfid) { + uuid_t *id = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!id) { + gf_log(this->name, GF_LOG_WARNING, + "Could not set gfid" + ". 
gfid will not be dumped in statedump file."); + return; + } + gf_uuid_copy(*id, gfid); + frame->local = id; + } +} + +void +barrier_local_free_gfid(call_frame_t *frame) +{ + if (frame->local) { + GF_FREE(frame->local); + frame->local = NULL; + } +} + +int32_t +barrier_truncate_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int32_t +barrier_ftruncate_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int32_t +barrier_unlink_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int32_t +barrier_rmdir_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int32_t +barrier_rename_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, 
xdata); + return 0; +} + +int32_t +barrier_writev_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int32_t +barrier_fsync_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +barrier_removexattr_cbk_resume(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +barrier_fremovexattr_cbk_resume(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + barrier_local_free_gfid(frame); + STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +barrier_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + BARRIER_FOP_CBK(writev, out, frame, this, op_ret, op_errno, prebuf, postbuf, + xdata); +out: + return 0; +} + +int32_t +barrier_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + BARRIER_FOP_CBK(fremovexattr, out, frame, this, op_ret, op_errno, xdata); +out: + return 0; +} + +int32_t +barrier_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + BARRIER_FOP_CBK(removexattr, out, frame, this, op_ret, op_errno, xdata); +out: + return 0; +} + +int32_t 
+barrier_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + BARRIER_FOP_CBK(truncate, out, frame, this, op_ret, op_errno, prebuf, + postbuf, xdata); +out: + return 0; +} + +int32_t +barrier_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + BARRIER_FOP_CBK(ftruncate, out, frame, this, op_ret, op_errno, prebuf, + postbuf, xdata); +out: + return 0; +} + +int32_t +barrier_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + BARRIER_FOP_CBK(rename, out, frame, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, postnewparent, + xdata); +out: + return 0; +} + +int32_t +barrier_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + BARRIER_FOP_CBK(rmdir, out, frame, this, op_ret, op_errno, preparent, + postparent, xdata); +out: + return 0; +} + +int32_t +barrier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + BARRIER_FOP_CBK(unlink, out, frame, this, op_ret, op_errno, preparent, + postparent, xdata); +out: + return 0; +} + +int32_t +barrier_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + BARRIER_FOP_CBK(fsync, out, frame, this, op_ret, op_errno, prebuf, postbuf, + xdata); +out: + return 0; +} + +int32_t +barrier_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, 
int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + if (!((flags | fd->flags) & (O_SYNC | O_DSYNC))) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, + flags, iobref, xdata); + + return 0; + } + + barrier_local_set_gfid(frame, fd->inode->gfid, this); + STACK_WIND(frame, barrier_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags, + iobref, xdata); + return 0; +} + +int32_t +barrier_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + barrier_local_set_gfid(frame, fd->inode->gfid, this); + STACK_WIND(frame, barrier_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +} + +int32_t +barrier_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + barrier_local_set_gfid(frame, loc->inode->gfid, this); + STACK_WIND(frame, barrier_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +} + +int32_t +barrier_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + barrier_local_set_gfid(frame, loc->inode->gfid, this); + STACK_WIND(frame, barrier_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +int32_t +barrier_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + barrier_local_set_gfid(frame, oldloc->inode->gfid, this); + STACK_WIND(frame, barrier_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; +} + +int +barrier_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + barrier_local_set_gfid(frame, loc->inode->gfid, this); + STACK_WIND(frame, barrier_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, 
xdata); + return 0; +} + +int32_t +barrier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + barrier_local_set_gfid(frame, loc->inode->gfid, this); + STACK_WIND(frame, barrier_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +} + +int32_t +barrier_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + barrier_local_set_gfid(frame, fd->inode->gfid, this); + STACK_WIND(frame, barrier_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +int32_t +barrier_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + barrier_local_set_gfid(frame, fd->inode->gfid, this); + STACK_WIND(frame, barrier_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +} + +call_stub_t * +__barrier_dequeue(xlator_t *this, struct list_head *queue) +{ + call_stub_t *stub = NULL; + barrier_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + if (list_empty(queue)) + goto out; + + stub = list_entry(queue->next, call_stub_t, list); + list_del_init(&stub->list); + +out: + return stub; +} + +void +barrier_dequeue_all(xlator_t *this, struct list_head *queue) +{ + call_stub_t *stub = NULL; + + gf_log(this->name, GF_LOG_INFO, "Dequeuing all the barriered fops"); + + /* TODO: Start the below task in a new thread */ + while ((stub = __barrier_dequeue(this, queue))) + call_resume(stub); + + gf_log(this->name, GF_LOG_INFO, + "Dequeuing the barriered fops is " + "finished"); + return; +} + +void +barrier_timeout(void *data) +{ + xlator_t *this = NULL; + barrier_priv_t *priv = NULL; + struct list_head queue = { + 0, + }; + + this = data; + THIS = this; + priv = this->private; + + INIT_LIST_HEAD(&queue); + + gf_log(this->name, GF_LOG_CRITICAL, + "Disabling barrier because of " + "the barrier timeout."); + + LOCK(&priv->lock); + { + 
__barrier_disable(this, &queue); + } + UNLOCK(&priv->lock); + + barrier_dequeue_all(this, &queue); + + return; +} + +void +__barrier_enqueue(xlator_t *this, call_stub_t *stub) +{ + barrier_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + list_add_tail(&stub->list, &priv->queue); + priv->queue_size++; + + return; +} + +void +__barrier_disable(xlator_t *this, struct list_head *queue) +{ + GF_UNUSED int ret = 0; + barrier_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + if (priv->timer) { + ret = gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + + list_splice_init(&priv->queue, queue); + priv->queue_size = 0; + priv->barrier_enabled = _gf_false; +} + +int +__barrier_enable(xlator_t *this, barrier_priv_t *priv) +{ + int ret = -1; + + priv->timer = gf_timer_call_after(this->ctx, priv->timeout, barrier_timeout, + (void *)this); + if (!priv->timer) { + gf_log(this->name, GF_LOG_CRITICAL, + "Couldn't add barrier " + "timeout event."); + goto out; + } + + priv->barrier_enabled = _gf_true; + ret = 0; +out: + return ret; +} + +int +notify(xlator_t *this, int event, void *data, ...) 
+{ + barrier_priv_t *priv = this->private; + dict_t *dict = NULL; + int ret = -1; + int barrier_enabled = _gf_false; + struct list_head queue = { + 0, + }; + + GF_ASSERT(priv); + INIT_LIST_HEAD(&queue); + + switch (event) { + case GF_EVENT_TRANSLATOR_OP: { + dict = data; + barrier_enabled = dict_get_str_boolean(dict, "barrier", -1); + + if (barrier_enabled == -1) { + gf_log(this->name, GF_LOG_ERROR, + "Could not fetch " + " barrier key from the dictionary."); + goto out; + } + + LOCK(&priv->lock); + { + if (!priv->barrier_enabled) { + if (barrier_enabled) { + ret = __barrier_enable(this, priv); + } else { + UNLOCK(&priv->lock); + gf_log(this->name, GF_LOG_ERROR, "Already disabled."); + goto post_unlock; + } + } else { + if (!barrier_enabled) { + __barrier_disable(this, &queue); + ret = 0; + } else { + UNLOCK(&priv->lock); + gf_log(this->name, GF_LOG_ERROR, "Already enabled"); + goto post_unlock; + } + } + } + UNLOCK(&priv->lock); + post_unlock: + if (!list_empty(&queue)) + barrier_dequeue_all(this, &queue); + + break; + } + default: { + default_notify(this, event, data); + ret = 0; + goto out; + } + } +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + barrier_priv_t *priv = NULL; + int ret = -1; + gf_boolean_t barrier_enabled = _gf_false; + uint32_t timeout = { + 0, + }; + struct list_head queue = { + 0, + }; + + priv = this->private; + GF_ASSERT(priv); + + GF_OPTION_RECONF("barrier", barrier_enabled, options, bool, out); + GF_OPTION_RECONF("barrier-timeout", timeout, options, time, out); + + INIT_LIST_HEAD(&queue); + + LOCK(&priv->lock); + { + if (!priv->barrier_enabled) { + if (barrier_enabled) { + ret = __barrier_enable(this, priv); + if (ret) { + goto unlock; + } + } + } else { + if (!barrier_enabled) { + __barrier_disable(this, &queue); + } + } + priv->timeout.tv_sec = timeout; + ret = 0; + } +unlock: + UNLOCK(&priv->lock); + + if (!list_empty(&queue)) + barrier_dequeue_all(this, &queue); + +out: + return ret; +} + +int32_t 
+mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_barrier_mt_end + 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting " + "initialization failed."); + + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + barrier_priv_t *priv = NULL; + uint32_t timeout = { + 0, + }; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "'barrier' not configured with exactly one child"); + goto out; + } + + if (!this->parents) + gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile "); + + priv = GF_CALLOC(1, sizeof(*priv), gf_barrier_mt_priv_t); + if (!priv) + goto out; + + LOCK_INIT(&priv->lock); + + GF_OPTION_INIT("barrier", priv->barrier_enabled, bool, out); + GF_OPTION_INIT("barrier-timeout", timeout, time, out); + priv->timeout.tv_sec = timeout; + + INIT_LIST_HEAD(&priv->queue); + + if (priv->barrier_enabled) { + ret = __barrier_enable(this, priv); + if (ret == -1) + goto out; + } + + this->private = priv; + ret = 0; +out: + if (ret && priv) + GF_FREE(priv); + + return ret; +} + +void +fini(xlator_t *this) +{ + barrier_priv_t *priv = NULL; + struct list_head queue = { + 0, + }; + + priv = this->private; + if (!priv) + goto out; + + INIT_LIST_HEAD(&queue); + + gf_log(this->name, GF_LOG_INFO, + "Disabling barriering and dequeuing " + "all the queued fops"); + LOCK(&priv->lock); + { + __barrier_disable(this, &queue); + } + UNLOCK(&priv->lock); + + if (!list_empty(&queue)) + barrier_dequeue_all(this, &queue); + + this->private = NULL; + + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); +out: + return; +} + +static void +barrier_dump_stub(call_stub_t *stub, char *prefix) +{ + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + gf_proc_dump_build_key(key, prefix, "fop"); + gf_proc_dump_write(key, "%s", gf_fop_list[stub->fop]); + + if (stub->frame->local) { + gf_proc_dump_build_key(key, prefix, "gfid"); + gf_proc_dump_write(key, "%s", + uuid_utoa(*(uuid_t 
*)(stub->frame->local))); + } + if (stub->args.loc.path) { + gf_proc_dump_build_key(key, prefix, "path"); + gf_proc_dump_write(key, "%s", stub->args.loc.path); + } + if (stub->args.loc.name) { + gf_proc_dump_build_key(key, prefix, "name"); + gf_proc_dump_write(key, "%s", stub->args.loc.name); + } + + return; +} + +static void +__barrier_dump_queue(barrier_priv_t *priv) +{ + call_stub_t *stub = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + int i = 0; + + GF_VALIDATE_OR_GOTO("barrier", priv, out); + + list_for_each_entry(stub, &priv->queue, list) + { + snprintf(key, sizeof(key), "stub.%d", i++); + gf_proc_dump_add_section("%s", key); + barrier_dump_stub(stub, key); + } + +out: + return; +} + +int +barrier_dump_priv(xlator_t *this) +{ + int ret = -1; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + barrier_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("barrier", this, out); + + priv = this->private; + if (!priv) + return 0; + + gf_proc_dump_build_key(key, "xlator.features.barrier", "priv"); + gf_proc_dump_add_section("%s", key); + gf_proc_dump_build_key(key, "barrier", "enabled"); + + LOCK(&priv->lock); + { + gf_proc_dump_write(key, "%d", priv->barrier_enabled); + gf_proc_dump_build_key(key, "barrier", "timeout"); + gf_proc_dump_write(key, "%ld", priv->timeout.tv_sec); + if (priv->barrier_enabled) { + gf_proc_dump_build_key(key, "barrier", "queue_size"); + gf_proc_dump_write(key, "%d", priv->queue_size); + __barrier_dump_queue(priv); + } + } + UNLOCK(&priv->lock); + +out: + return ret; +} + +struct xlator_fops fops = { + + /* Barrier Class fops */ + .rmdir = barrier_rmdir, + .unlink = barrier_unlink, + .rename = barrier_rename, + .removexattr = barrier_removexattr, + .fremovexattr = barrier_fremovexattr, + .truncate = barrier_truncate, + .ftruncate = barrier_ftruncate, + .fsync = barrier_fsync, + + /* Writes with only O_SYNC flag */ + .writev = barrier_writev, +}; + +struct xlator_dumpops dumpops = { + .priv = barrier_dump_priv, +}; + +struct xlator_cbks cbks; + 
+struct volume_options options[] = { + {.key = {"barrier"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "disable", + .op_version = {GD_OP_VERSION_3_6_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "When \"enabled\", blocks acknowledgements to application " + "for file operations such as rmdir, rename, unlink, " + "removexattr, fremovexattr, truncate, ftruncate, " + "write (with O_SYNC), fsync. It is turned \"off\" by " + "default."}, + {.key = {"barrier-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = BARRIER_TIMEOUT, + .op_version = {GD_OP_VERSION_3_6_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "After 'timeout' seconds since the time 'barrier' " + "option was set to \"on\", acknowledgements to file " + "operations are no longer blocked and previously " + "blocked acknowledgements are sent to the application"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "barrier", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/barrier/src/barrier.h b/xlators/features/barrier/src/barrier.h new file mode 100644 index 00000000000..1337f311f7d --- /dev/null +++ b/xlators/features/barrier/src/barrier.h @@ -0,0 +1,89 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __BARRIER_H__ +#define __BARRIER_H__ + +#include "barrier-mem-types.h" +#include <glusterfs/xlator.h> +#include <glusterfs/timer.h> +#include <glusterfs/call-stub.h> + +#define BARRIER_FOP_CBK(fop_name, label, frame, this, params...) \ + do { \ + barrier_priv_t *_priv = NULL; \ + call_stub_t *_stub = NULL; \ + gf_boolean_t _barrier_enabled = _gf_false; \ + struct list_head queue = { \ + 0, \ + }; \ + \ + INIT_LIST_HEAD(&queue); \ + \ + _priv = this->private; \ + GF_ASSERT(_priv); \ + \ + LOCK(&_priv->lock); \ + { \ + if (_priv->barrier_enabled) { \ + _barrier_enabled = _priv->barrier_enabled; \ + \ + _stub = fop_##fop_name##_cbk_stub( \ + frame, barrier_##fop_name##_cbk_resume, params); \ + if (!_stub) { \ + __barrier_disable(this, &queue); \ + goto unlock; \ + } \ + \ + __barrier_enqueue(this, _stub); \ + } \ + } \ + unlock: \ + UNLOCK(&_priv->lock); \ + \ + if (_stub) \ + goto label; \ + \ + if (_barrier_enabled && !_stub) { \ + gf_log(this->name, GF_LOG_CRITICAL, \ + "Failed to barrier FOPs, disabling " \ + "barrier. 
FOP: %s, ERROR: %s", \ + #fop_name, strerror(ENOMEM)); \ + barrier_dequeue_all(this, &queue); \ + } \ + barrier_local_free_gfid(frame); \ + STACK_UNWIND_STRICT(fop_name, frame, params); \ + goto label; \ + } while (0) + +typedef struct { + gf_timer_t *timer; + gf_lock_t lock; + struct list_head queue; + struct timespec timeout; + uint32_t queue_size; + gf_boolean_t barrier_enabled; + char _pad[3]; /* manual padding */ +} barrier_priv_t; + +int +__barrier_enable(xlator_t *this, barrier_priv_t *priv); +void +__barrier_enqueue(xlator_t *this, call_stub_t *stub); +void +__barrier_disable(xlator_t *this, struct list_head *queue); +void +barrier_timeout(void *data); +void +barrier_dequeue_all(xlator_t *this, struct list_head *queue); +call_stub_t * +__barrier_dequeue(xlator_t *this, struct list_head *queue); + +#endif diff --git a/xlators/features/bit-rot/Makefile.am b/xlators/features/bit-rot/Makefile.am new file mode 100644 index 00000000000..f963effea22 --- /dev/null +++ b/xlators/features/bit-rot/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src
\ No newline at end of file diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am new file mode 100644 index 00000000000..b5e4a7d62a0 --- /dev/null +++ b/xlators/features/bit-rot/src/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = stub bitd diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am new file mode 100644 index 00000000000..6db800e6565 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/Makefile.am @@ -0,0 +1,23 @@ +if WITH_SERVER +xlator_LTLIBRARIES = bit-rot.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bit_rot_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \ + -I$(top_srcdir)/rpc/rpc-lib/src -I$(CONTRIBDIR)/timer-wheel \ + -I$(top_srcdir)/xlators/features/bit-rot/src/stub + +bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-ssm.c \ + bit-rot-scrub-status.c +bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la + +noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-bitd-messages.h bit-rot-ssm.h \ + bit-rot-scrub-status.h + +AM_CFLAGS = -Wall -DBR_RATE_LIMIT_SIGNER $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h new file mode 100644 index 00000000000..5bc5103a27c --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _BITROT_BITD_MESSAGES_H_ +#define _BITROT_BITD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(BITROT_BITD, BRB_MSG_FD_CREATE_FAILED, BRB_MSG_READV_FAILED, + BRB_MSG_BLOCK_READ_FAILED, BRB_MSG_CALC_CHECKSUM_FAILED, + BRB_MSG_NO_MEMORY, BRB_MSG_GET_SIGN_FAILED, BRB_MSG_SET_SIGN_FAILED, + BRB_MSG_OP_FAILED, BRB_MSG_READ_AND_SIGN_FAILED, BRB_MSG_SIGN_FAILED, + BRB_MSG_GET_SUBVOL_FAILED, BRB_MSG_SET_TIMER_FAILED, + BRB_MSG_GET_INFO_FAILED, BRB_MSG_PATH_FAILED, BRB_MSG_MARK_BAD_FILE, + BRB_MSG_TRIGGER_SIGN, BRB_MSG_REGISTER_FAILED, + BRB_MSG_CRAWLING_START, BRB_MSG_SPAWN_FAILED, + BRB_MSG_INVALID_SUBVOL_CHILD, BRB_MSG_SKIP_OBJECT, BRB_MSG_NO_CHILD, + BRB_MSG_CHECKSUM_MISMATCH, BRB_MSG_MARK_CORRUPTED, + BRB_MSG_CRAWLING_FINISH, BRB_MSG_CALC_ERROR, BRB_MSG_LOOKUP_FAILED, + BRB_MSG_PARTIAL_VERSION_PRESENCE, BRB_MSG_MEM_ACNT_FAILED, + BRB_MSG_TIMER_WHEEL_UNAVAILABLE, BRB_MSG_BITROT_LOADED, + BRB_MSG_SCALE_DOWN_FAILED, BRB_MSG_SCALE_UP_FAILED, + BRB_MSG_SCALE_DOWN_SCRUBBER, BRB_MSG_SCALING_UP_SCRUBBER, + BRB_MSG_UNKNOWN_THROTTLE, BRB_MSG_RATE_LIMIT_INFO, + BRB_MSG_SCRUB_INFO, BRB_MSG_CONNECTED_TO_BRICK, BRB_MSG_BRICK_INFO, + BRB_MSG_SUBVOL_CONNECT_FAILED, BRB_MSG_INVALID_SUBVOL, + BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, BRB_MSG_SCRUB_START, + BRB_MSG_SCRUB_FINISH, BRB_MSG_SCRUB_RUNNING, + BRB_MSG_SCRUB_RESCHEDULED, 
BRB_MSG_SCRUB_TUNABLE, + BRB_MSG_SCRUB_THREAD_CLEANUP, BRB_MSG_SCRUBBER_CLEANED, + BRB_MSG_GENERIC_SSM_INFO, BRB_MSG_ZERO_TIMEOUT_BUG, + BRB_MSG_BAD_OBJ_READDIR_FAIL, BRB_MSG_SSM_FAILED, + BRB_MSG_SCRUB_WAIT_FAILED, BRB_MSG_TRIGGER_SIGN_FAILED, + BRB_MSG_EVENT_UNHANDLED, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, + BRB_MSG_THREAD_CREATION_FAILED, BRB_MSG_MEM_POOL_ALLOC, + BRB_MSG_SAVING_HASH_FAILED); + +#define BRB_MSG_FD_CREATE_FAILED_STR "failed to create fd for the inode" +#define BRB_MSG_READV_FAILED_STR "readv failed" +#define BRB_MSG_BLOCK_READ_FAILED_STR "reading block failed" +#define BRB_MSG_NO_MEMORY_STR "failed to allocate memory" +#define BRB_MSG_CALC_CHECKSUM_FAILED_STR "calculating checksum failed" +#define BRB_MSG_GET_SIGN_FAILED_STR "failed to get the signature" +#define BRB_MSG_SET_SIGN_FAILED_STR "signing failed" +#define BRB_MSG_OP_FAILED_STR "failed on object" +#define BRB_MSG_TRIGGER_SIGN_FAILED_STR "Could not trigger signing" +#define BRB_MSG_READ_AND_SIGN_FAILED_STR "reading and signing of object failed" +#define BRB_MSG_SET_TIMER_FAILED_STR "Failed to allocate object expiry timer" +#define BRB_MSG_GET_SUBVOL_FAILED_STR \ + "failed to get the subvolume for the brick" +#define BRB_MSG_PATH_FAILED_STR "path failed" +#define BRB_MSG_SKIP_OBJECT_STR "Entry is marked corrupted. 
skipping" +#define BRB_MSG_PARTIAL_VERSION_PRESENCE_STR \ + "PArtial version xattr presence detected, ignoring" +#define BRB_MSG_TRIGGER_SIGN_STR "Triggering signing" +#define BRB_MSG_CRAWLING_START_STR \ + "Crawling brick, scanning for unsigned objects" +#define BRB_MSG_CRAWLING_FINISH_STR "Completed crawling brick" +#define BRB_MSG_REGISTER_FAILED_STR "Register to changelog failed" +#define BRB_MSG_SPAWN_FAILED_STR "failed to spawn" +#define BRB_MSG_CONNECTED_TO_BRICK_STR "Connected to brick" +#define BRB_MSG_LOOKUP_FAILED_STR "lookup on root failed" +#define BRB_MSG_GET_INFO_FAILED_STR "failed to get stub info" +#define BRB_MSG_SCRUB_THREAD_CLEANUP_STR "Error cleaning up scanner thread" +#define BRB_MSG_SCRUBBER_CLEANED_STR "clened up scrubber for brick" +#define BRB_MSG_SUBVOL_CONNECT_FAILED_STR \ + "callback handler for subvolume failed" +#define BRB_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed" +#define BRB_MSG_EVENT_UNHANDLED_STR "Event unhandled for child" +#define BRB_MSG_INVALID_SUBVOL_STR "Got event from invalid subvolume" +#define BRB_MSG_RESCHEDULE_SCRUBBER_FAILED_STR \ + "on demand scrub schedule failed. Scrubber is not in pending state." +#define BRB_MSG_COULD_NOT_SCHEDULE_SCRUB_STR \ + "Could not schedule ondemand scrubbing. Scrubbing will continue " \ + "according to old frequency." 
+#define BRB_MSG_THREAD_CREATION_FAILED_STR "thread creation failed" +#define BRB_MSG_RATE_LIMIT_INFO_STR "Rate Limit Info" +#define BRB_MSG_MEM_POOL_ALLOC_STR "failed to allocate mem-pool for timer" +#define BRB_MSG_NO_CHILD_STR "FATAL: no children" +#define BRB_MSG_TIMER_WHEEL_UNAVAILABLE_STR "global timer wheel unavailable" +#define BRB_MSG_BITROT_LOADED_STR "bit-rot xlator loaded" +#define BRB_MSG_SAVING_HASH_FAILED_STR \ + "failed to allocate memory for saving hash of the object" +#endif /* !_BITROT_BITD_MESSAGES_H_ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c new file mode 100644 index 00000000000..5cef2ffa5e5 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c @@ -0,0 +1,78 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> +#include <stdio.h> + +#include "bit-rot-scrub-status.h" + +void +br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->unsigned_files++; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrubbed_files++; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrub_start_time = time; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, + time_t time) +{ + int lst_size = 0; + + if (!scrub_stat) + return; + + lst_size = sizeof(scrub_stat->last_scrub_time); + if (strlen(timestr) >= lst_size) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrub_end_time = time; + + scrub_stat->scrub_duration = scrub_stat->scrub_end_time - + scrub_stat->scrub_start_time; + + snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr); + } + pthread_mutex_unlock(&scrub_stat->lock); +} diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h new file mode 100644 index 00000000000..f022aa831eb --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h @@ -0,0 +1,50 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __BIT_ROT_SCRUB_STATUS_H__ +#define __BIT_ROT_SCRUB_STATUS_H__ + +#include <stdint.h> +#include <sys/time.h> +#include <pthread.h> + +#include <glusterfs/common-utils.h> + +struct br_scrub_stats { + uint64_t scrubbed_files; /* Total number of scrubbed files. */ + + uint64_t unsigned_files; /* Total number of unsigned files. */ + + uint64_t scrub_duration; /* Duration of last scrub. */ + + char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */ + + time_t scrub_start_time; /* Scrubbing starting time. */ + + time_t scrub_end_time; /* Scrubbing finishing time. */ + + int8_t scrub_running; /* Whether scrub running or not. */ + + pthread_mutex_t lock; +}; + +typedef struct br_scrub_stats br_scrub_stats_t; + +void +br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat); +void +br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat); +void +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time); +void +br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, + time_t time); + +#endif /* __BIT_ROT_SCRUB_STATUS_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c new file mode 100644 index 00000000000..289dd53f610 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c @@ -0,0 +1,2070 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <math.h> +#include <ctype.h> +#include <sys/uio.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> + +#include "bit-rot-scrub.h" +#include <pthread.h> +#include "bit-rot-bitd-messages.h" +#include "bit-rot-scrub-status.h" +#include <glusterfs/events.h> + +struct br_scrubbers { + pthread_t scrubthread; + + struct list_head list; +}; + +struct br_fsscan_entry { + void *data; + + loc_t parent; + + gf_dirent_t *entry; + + struct br_scanfs *fsscan; /* backpointer to subvolume scanner */ + + struct list_head list; +}; + +/** + * fetch signature extended attribute from an object's fd. + * NOTE: On success @xattr is not unref'd as @sign points + * to the dictionary value. + */ +static int32_t +bitd_fetch_signature(xlator_t *this, br_child_t *child, fd_t *fd, + dict_t **xattr, br_isignature_out_t **sign) +{ + int32_t ret = -1; + + ret = syncop_fgetxattr(child->xl, fd, xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, + NULL, NULL); + if (ret < 0) { + br_log_object(this, "fgetxattr", fd->inode->gfid, -ret); + goto out; + } + + ret = dict_get_ptr(*xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)sign); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "failed to extract signature info [GFID: %s]", + uuid_utoa(fd->inode->gfid)); + goto unref_dict; + } + + return 0; + +unref_dict: + dict_unref(*xattr); +out: + return -1; +} + +/** + * POST COMPUTE CHECK + * + * Checks to be performed before verifying calculated signature + * Object is skipped if: + * - has stale signature + * - mismatches versions caches in pre-compute check + */ + +int32_t +bitd_scrub_post_compute_check(xlator_t *this, br_child_t *child, fd_t *fd, + unsigned long version, + br_isignature_out_t **signature, + br_scrub_stats_t *scrub_stat, + gf_boolean_t skip_stat) +{ + int32_t ret = 0; + size_t signlen = 0; + dict_t *xattr = NULL; + br_isignature_out_t *signptr = NULL; + + ret = bitd_fetch_signature(this, child, fd, &xattr, 
&signptr); + if (ret < 0) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + goto out; + } + + /** + * Either the object got dirtied during the time the signature was + * calculated OR the version we saved during pre-compute check does + * not match now, implying that the object got dirtied and signed in + * between scrubs pre & post compute checks (checksum window). + * + * The log entry looks pretty ugly, but helps in debugging.. + */ + if (signptr->stale || (signptr->version != version)) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + gf_msg_debug(this->name, 0, + "<STAGE: POST> Object [GFID: %s] " + "either has a stale signature OR underwent " + "signing during checksumming {Stale: %d | " + "Version: %lu,%lu}", + uuid_utoa(fd->inode->gfid), (signptr->stale) ? 1 : 0, + version, signptr->version); + ret = -1; + goto unref_dict; + } + + signlen = signptr->signaturelen; + *signature = GF_MALLOC(sizeof(br_isignature_out_t) + signlen, + gf_common_mt_char); + + (void)memcpy(*signature, signptr, sizeof(br_isignature_out_t) + signlen); + + (*signature)->signaturelen = signlen; + +unref_dict: + dict_unref(xattr); +out: + return ret; +} + +static int32_t +bitd_signature_staleness(xlator_t *this, br_child_t *child, fd_t *fd, + int *stale, unsigned long *version, + br_scrub_stats_t *scrub_stat, gf_boolean_t skip_stat) +{ + int32_t ret = -1; + dict_t *xattr = NULL; + br_isignature_out_t *signptr = NULL; + + ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr); + if (ret < 0) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + goto out; + } + + /** + * save version for validation in post compute stage + * c.f. bitd_scrub_post_compute_check() + */ + *stale = signptr->stale ? 1 : 0; + *version = signptr->version; + + dict_unref(xattr); + +out: + return ret; +} + +/** + * PRE COMPUTE CHECK + * + * Checks to be performed before initiating object signature calculation. 
+ * An object is skipped if: + * - it's already marked corrupted + * - has stale signature + */ +int32_t +bitd_scrub_pre_compute_check(xlator_t *this, br_child_t *child, fd_t *fd, + unsigned long *version, + br_scrub_stats_t *scrub_stat, + gf_boolean_t skip_stat) +{ + int stale = 0; + int32_t ret = -1; + + if (bitd_is_bad_file(this, child, NULL, fd)) { + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, + "Object [GFID: %s] is marked corrupted, skipping..", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + ret = bitd_signature_staleness(this, child, fd, &stale, version, scrub_stat, + skip_stat); + if (!ret && stale) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + gf_msg_debug(this->name, 0, + "<STAGE: PRE> Object [GFID: %s] " + "has stale signature", + uuid_utoa(fd->inode->gfid)); + ret = -1; + } + +out: + return ret; +} + +/* static int */ +int +bitd_compare_ckum(xlator_t *this, br_isignature_out_t *sign, unsigned char *md, + inode_t *linked_inode, gf_dirent_t *entry, fd_t *fd, + br_child_t *child, loc_t *loc) +{ + int ret = -1; + dict_t *xattr = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, sign, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, linked_inode, out); + GF_VALIDATE_OR_GOTO(this->name, md, out); + GF_VALIDATE_OR_GOTO(this->name, entry, out); + + if (strncmp(sign->signature, (char *)md, sign->signaturelen) == 0) { + gf_msg_debug(this->name, 0, + "%s [GFID: %s | Brick: %s] " + "matches calculated checksum", + loc->path, uuid_utoa(linked_inode->gfid), + child->brick_path); + return 0; + } + + gf_msg(this->name, GF_LOG_DEBUG, 0, BRB_MSG_CHECKSUM_MISMATCH, + "Object checksum mismatch: %s [GFID: %s | Brick: %s]", loc->path, + uuid_utoa(linked_inode->gfid), child->brick_path); + gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_CHECKSUM_MISMATCH, + "CORRUPTION DETECTED: Object %s {Brick: %s | GFID: %s}", loc->path, + 
child->brick_path, uuid_utoa(linked_inode->gfid)); + + /* Perform bad-file marking */ + xattr = dict_new(); + if (!xattr) { + ret = -1; + goto out; + } + + ret = dict_set_int32(xattr, BITROT_OBJECT_BAD_KEY, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE, + "Error setting bad-file marker for %s [GFID: %s | " + "Brick: %s]", + loc->path, uuid_utoa(linked_inode->gfid), child->brick_path); + goto dictfree; + } + + gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_MARK_CORRUPTED, + "Marking" + " %s [GFID: %s | Brick: %s] as corrupted..", + loc->path, uuid_utoa(linked_inode->gfid), child->brick_path); + gf_event(EVENT_BITROT_BAD_FILE, "gfid=%s;path=%s;brick=%s", + uuid_utoa(linked_inode->gfid), loc->path, child->brick_path); + ret = syncop_fsetxattr(child->xl, fd, xattr, 0, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE, + "Error marking object %s [GFID: %s] as corrupted", loc->path, + uuid_utoa(linked_inode->gfid)); + +dictfree: + dict_unref(xattr); +out: + return ret; +} + +/** + * "The Scrubber" + * + * Perform signature validation for a given object with the assumption + * that the signature is SHA256 (because signer as of now _always_ + * signs with SHA256). 
+ *
+ * Looks the entry up, opens it, runs the pre-compute check, calculates the
+ * checksum, runs the post-compute check and finally compares the checksum
+ * with the stored signature. Returns 0 when the object was scrubbed or is
+ * not subject to scrubbing (non-regular file, dht link file), non-zero
+ * otherwise.
+ */
+int
+br_scrubber_scrub_begin(xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    pid_t pid = 0;
+    br_child_t *child = NULL;
+    unsigned char *md = NULL;
+    inode_t *linked_inode = NULL;
+    br_isignature_out_t *sign = NULL;
+    unsigned long signedversion = 0;
+    gf_dirent_t *entry = NULL;
+    br_private_t *priv = NULL;
+    loc_t *parent = NULL;
+    gf_boolean_t skip_stat = _gf_false;
+    uuid_t shard_root_gfid = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("bit-rot", fsentry, out);
+
+    entry = fsentry->entry;
+    parent = &fsentry->parent;
+    child = fsentry->data;
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", entry, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", parent, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", child, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", priv, out);
+
+    pid = GF_CLIENT_PID_SCRUB;
+
+    /* NOTE(review): a zero return is treated as "nothing to scrub" here;
+     * confirm against br_prepare_loc()'s contract. */
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (!ret)
+        goto out;
+
+    syncopctx_setfspid(&pid);
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (!linked_inode) {
+        /*
+         * Everything below dereferences linked_inode, so a failed link
+         * cannot be scrubbed. Nothing to unref yet: leave through "out".
+         */
+        gf_msg_debug(this->name, 0, "failed to link inode for %s",
+                     entry->d_name);
+        ret = -1;
+        goto out;
+    }
+    inode_lookup(linked_inode);
+
+    gf_msg_debug(this->name, 0, "Scrubbing object %s [GFID: %s]", entry->d_name,
+                 uuid_utoa(linked_inode->gfid));
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0, "%s is not a regular file", entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    if (IS_DHT_LINKFILE_MODE((&iatt))) {
+        gf_msg_debug(this->name, 0, "%s is a dht sticky bit file",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /* skip updating scrub statistics for shard entries */
+    gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid);
+    if (gf_uuid_compare(loc.pargfid, shard_root_gfid) == 0)
+        skip_stat = _gf_true;
+
+    /**
+     * open() an fd for subsequent operations
+     */
+    fd = fd_create(linked_inode, 0);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+               "failed to create fd for inode %s",
+               uuid_utoa(linked_inode->gfid));
+        /* ret still holds 0 from the successful lookup */
+        ret = -1;
+        goto unref_inode;
+    }
+
+    ret = syncop_open(child->xl, &loc, O_RDWR, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", linked_inode->gfid, -ret);
+        ret = -1;
+        goto unrefd;
+    }
+
+    fd_bind(fd);
+
+    /**
+     * perform pre compute checks before initiating checksum
+     * computation
+     *  - presence of bad object
+     *  - signature staleness
+     */
+    ret = bitd_scrub_pre_compute_check(this, child, fd, &signedversion,
+                                       &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto unrefd; /* skip this object */
+
+    /* if all's good, proceed to calculate the hash */
+    md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
+    if (!md) {
+        /* ret still holds 0 from the pre-compute check */
+        ret = -1;
+        goto unrefd;
+    }
+
+    ret = br_calculate_obj_checksum(md, child, fd, &iatt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_ERROR,
+               "error calculating hash for object [GFID: %s]",
+               uuid_utoa(fd->inode->gfid));
+        ret = -1;
+        goto free_md;
+    }
+
+    /**
+     * perform post compute checks as an object's signature may have
+     * become stale while scrubber calculated checksum.
+     */
+    ret = bitd_scrub_post_compute_check(this, child, fd, signedversion, &sign,
+                                        &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto free_md;
+
+    ret = bitd_compare_ckum(this, sign, md, linked_inode, entry, fd, child,
+                            &loc);
+
+    if (!skip_stat)
+        br_inc_scrubbed_file(&priv->scrub_stat);
+
+    GF_FREE(sign); /* allocated on post-compute */
+
+    /** fd_unref() takes care of closing fd.. 
like syncop_close() */ + +free_md: + GF_FREE(md); +unrefd: + fd_unref(fd); +unref_inode: + inode_unref(linked_inode); +out: + loc_wipe(&loc); + return ret; +} + +static void +_br_lock_cleaner(void *arg) +{ + pthread_mutex_t *mutex = arg; + + pthread_mutex_unlock(mutex); +} + +static void +wait_for_scrubbing(xlator_t *this, struct br_scanfs *fsscan) +{ + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + + pthread_cleanup_push(_br_lock_cleaner, &fsscan->waitlock); + pthread_mutex_lock(&fsscan->waitlock); + { + pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex); + pthread_mutex_lock(&fsscrub->mutex); + { + list_replace_init(&fsscan->queued, &fsscan->ready); + + /* wake up scrubbers */ + pthread_cond_broadcast(&fsscrub->cond); + } + pthread_mutex_unlock(&fsscrub->mutex); + pthread_cleanup_pop(0); + + while (fsscan->entries != 0) + pthread_cond_wait(&fsscan->waitcond, &fsscan->waitlock); + } + pthread_mutex_unlock(&fsscan->waitlock); + pthread_cleanup_pop(0); +} + +static void +_br_fsscan_inc_entry_count(struct br_scanfs *fsscan) +{ + fsscan->entries++; +} + +static void +_br_fsscan_dec_entry_count(struct br_scanfs *fsscan) +{ + if (--fsscan->entries == 0) { + pthread_mutex_lock(&fsscan->waitlock); + { + pthread_cond_signal(&fsscan->waitcond); + } + pthread_mutex_unlock(&fsscan->waitlock); + } +} + +static void +_br_fsscan_collect_entry(struct br_scanfs *fsscan, + struct br_fsscan_entry *fsentry) +{ + list_add_tail(&fsentry->list, &fsscan->queued); + _br_fsscan_inc_entry_count(fsscan); +} + +#define NR_ENTRIES (1 << 7) /* ..bulk scrubbing */ + +int +br_fsscanner_handle_entry(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + int32_t ret = -1; + int scrub = 0; + br_child_t *child = NULL; + xlator_t *this = NULL; + struct br_scanfs *fsscan = NULL; + struct br_fsscan_entry *fsentry = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", subvol, error_return); + 
GF_VALIDATE_OR_GOTO("bit-rot", data, error_return); + + child = data; + this = child->this; + fsscan = &child->fsscan; + + _mask_cancellation(); + + fsentry = GF_CALLOC(1, sizeof(*fsentry), gf_br_mt_br_fsscan_entry_t); + if (!fsentry) + goto error_return; + + { + fsentry->data = data; + fsentry->fsscan = &child->fsscan; + + /* copy parent loc */ + ret = loc_copy(&fsentry->parent, parent); + if (ret) + goto dealloc; + + /* copy child entry */ + fsentry->entry = entry_copy(entry); + if (!fsentry->entry) + goto locwipe; + + INIT_LIST_HEAD(&fsentry->list); + } + + LOCK(&fsscan->entrylock); + { + _br_fsscan_collect_entry(fsscan, fsentry); + + /** + * need not be a equality check as entries may be pushed + * back onto the scanned queue when thread(s) are cleaned. + */ + if (fsscan->entries >= NR_ENTRIES) + scrub = 1; + } + UNLOCK(&fsscan->entrylock); + + _unmask_cancellation(); + + if (scrub) + wait_for_scrubbing(this, fsscan); + + return 0; + +locwipe: + loc_wipe(&fsentry->parent); +dealloc: + GF_FREE(fsentry); +error_return: + return -1; +} + +int32_t +br_fsscan_deactivate(xlator_t *this) +{ + int ret = 0; + br_private_t *priv = NULL; + br_scrub_state_t nstate = 0; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + ret = gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer); + if (ret == 0) { + nstate = BR_SCRUB_STATE_STALLED; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Volume is under active scrubbing. 
Pausing scrub.."); + } else { + nstate = BR_SCRUB_STATE_PAUSED; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber paused"); + } + + _br_monitor_set_scrub_state(scrub_monitor, nstate); + + return 0; +} + +static void +br_scrubber_log_time(xlator_t *this, const char *sfx) +{ + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + br_private_t *priv = NULL; + time_t now = 0; + + now = gf_time(); + priv = this->private; + + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); + + if (strcasecmp(sfx, "started") == 0) { + br_update_scrub_start_time(&priv->scrub_stat, now); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START, + "Scrubbing %s at %s", sfx, timestr); + } else { + br_update_scrub_finish_time(&priv->scrub_stat, timestr, now); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH, + "Scrubbing %s at %s", sfx, timestr); + } +} + +static void +br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx) +{ + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + + now = gf_time(); + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); + + if (strcasecmp(sfx, "started") == 0) { + gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", + child->brick_path, sfx, timestr); + } else { + gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", + child->brick_path, sfx, timestr); + } +} + +void +br_child_set_scrub_state(br_child_t *child, gf_boolean_t state) +{ + child->active_scrubbing = state; +} + +static void +br_fsscanner_wait_until_kicked(xlator_t *this, br_child_t *child) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock); + pthread_mutex_lock(&scrub_monitor->wakelock); + { + while (!scrub_monitor->kick) + pthread_cond_wait(&scrub_monitor->wakecond, + &scrub_monitor->wakelock); + + /* Child lock is to synchronize with disconnect events */ + 
pthread_cleanup_push(_br_lock_cleaner, &child->lock); + pthread_mutex_lock(&child->lock); + { + scrub_monitor->active_child_count++; + br_child_set_scrub_state(child, _gf_true); + } + pthread_mutex_unlock(&child->lock); + pthread_cleanup_pop(0); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + pthread_cleanup_pop(0); +} + +static void +br_scrubber_entry_control(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + LOCK(&scrub_monitor->lock); + { + /* Move the state to BR_SCRUB_STATE_ACTIVE */ + if (scrub_monitor->state == BR_SCRUB_STATE_PENDING) + scrub_monitor->state = BR_SCRUB_STATE_ACTIVE; + br_scrubber_log_time(this, "started"); + priv->scrub_stat.scrub_running = 1; + } + UNLOCK(&scrub_monitor->lock); +} + +static void +br_scrubber_exit_control(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + LOCK(&scrub_monitor->lock); + { + br_scrubber_log_time(this, "finished"); + priv->scrub_stat.scrub_running = 0; + + if (scrub_monitor->state == BR_SCRUB_STATE_ACTIVE) { + (void)br_fsscan_activate(this); + } else { + UNLOCK(&scrub_monitor->lock); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Volume waiting to get rescheduled.."); + return; + } + } + UNLOCK(&scrub_monitor->lock); +} + +static void +br_fsscanner_entry_control(xlator_t *this, br_child_t *child) +{ + br_fsscanner_log_time(this, child, "started"); +} + +static void +br_fsscanner_exit_control(xlator_t *this, br_child_t *child) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + if (!_br_is_child_connected(child)) { + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCRUB_INFO, + "Brick [%s] disconnected while scrubbing. 
Scrubbing " + "might be incomplete", + child->brick_path); + } + + br_fsscanner_log_time(this, child, "finished"); + + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock); + pthread_mutex_lock(&scrub_monitor->wakelock); + { + scrub_monitor->active_child_count--; + pthread_cleanup_push(_br_lock_cleaner, &child->lock); + pthread_mutex_lock(&child->lock); + { + br_child_set_scrub_state(child, _gf_false); + } + pthread_mutex_unlock(&child->lock); + pthread_cleanup_pop(0); + + if (scrub_monitor->active_child_count == 0) { + /* The last child has finished scrubbing. + * Set the kick to false and wake up other + * children who are waiting for the last + * child to complete scrubbing. + */ + scrub_monitor->kick = _gf_false; + pthread_cond_broadcast(&scrub_monitor->wakecond); + + /* Signal the monitor thread waiting for all + * the children to finish scrubbing. + */ + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->donelock); + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_true; + pthread_cond_signal(&scrub_monitor->donecond); + } + pthread_mutex_unlock(&scrub_monitor->donelock); + pthread_cleanup_pop(0); + } else { + while (scrub_monitor->active_child_count) + pthread_cond_wait(&scrub_monitor->wakecond, + &scrub_monitor->wakelock); + } + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + pthread_cleanup_pop(0); +} + +/** + * Scanner thread body for one child (arg is the br_child_t). Loops forever: + * blocks in br_fsscanner_wait_until_kicked(), runs syncop_ftw() from the + * child's root inode with br_fsscanner_handle_entry as the callback, calls + * wait_for_scrubbing() if fsscan->queued is non-empty, then performs the + * exit handshake in br_fsscanner_exit_control(). + */ +void * +br_fsscanner(void *arg) +{ + loc_t loc = { + 0, + }; + br_child_t *child = NULL; + xlator_t *this = NULL; + struct br_scanfs *fsscan = NULL; + + child = arg; + this = child->this; + fsscan = &child->fsscan; + + THIS = this; + loc.inode = child->table->root; + + while (1) { + br_fsscanner_wait_until_kicked(this, child); + { + /* precursor for scrub */ + br_fsscanner_entry_control(this, child); + + /* scrub */ + (void)syncop_ftw(child->xl, &loc, GF_CLIENT_PID_SCRUB, child, + br_fsscanner_handle_entry); + if (!list_empty(&fsscan->queued)) + wait_for_scrubbing(this, fsscan); + + /* scrub exit criteria
*/ + br_fsscanner_exit_control(this, child); + } + } + + return NULL; +} + +/** + * Keep this routine extremely simple and do not ever try to acquire + * child->lock here: it may lead to deadlock. Scrubber state is + * modified in br_fsscanner(). An intermediate state change to pause + * changes the scrub state to the _correct_ state by identifying a + * non-pending timer. + */ +void +br_kickstart_scanner(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + xlator_t *this = NULL; + struct br_monitor *scrub_monitor = data; + br_private_t *priv = NULL; + + THIS = this = scrub_monitor->this; + priv = this->private; + + /* Reset scrub statistics */ + priv->scrub_stat.scrubbed_files = 0; + priv->scrub_stat.unsigned_files = 0; + + /* Moves state from PENDING to ACTIVE */ + (void)br_scrubber_entry_control(this); + + /* kickstart scanning.. */ + pthread_mutex_lock(&scrub_monitor->wakelock); + { + scrub_monitor->kick = _gf_true; + GF_ASSERT(scrub_monitor->active_child_count == 0); + pthread_cond_broadcast(&scrub_monitor->wakecond); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + + return; +} + +static uint32_t +br_fsscan_calculate_delta(uint32_t times) +{ + return times; +} + +#define BR_SCRUB_ONDEMAND (1) +#define BR_SCRUB_MINUTE (60) +#define BR_SCRUB_HOURLY (60 * 60) +#define BR_SCRUB_DAILY (1 * 24 * 60 * 60) +#define BR_SCRUB_WEEKLY (7 * 24 * 60 * 60) +#define BR_SCRUB_BIWEEKLY (14 * 24 * 60 * 60) +#define BR_SCRUB_MONTHLY (30 * 24 * 60 * 60) + +static unsigned int +br_fsscan_calculate_timeout(scrub_freq_t freq) +{ + uint32_t timo = 0; + + switch (freq) { + case BR_FSSCRUB_FREQ_MINUTE: + timo = br_fsscan_calculate_delta(BR_SCRUB_MINUTE); + break; + case BR_FSSCRUB_FREQ_HOURLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_HOURLY); + break; + case BR_FSSCRUB_FREQ_DAILY: + timo = br_fsscan_calculate_delta(BR_SCRUB_DAILY); + break; + case BR_FSSCRUB_FREQ_WEEKLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_WEEKLY); + break; + case 
BR_FSSCRUB_FREQ_BIWEEKLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_BIWEEKLY); + break; + case BR_FSSCRUB_FREQ_MONTHLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_MONTHLY); + break; + default: + timo = 0; + } + + return timo; +} + +int32_t +br_fsscan_schedule(xlator_t *this) +{ + uint32_t timo = 0; + br_private_t *priv = NULL; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + struct br_scrubber *fsscrub = NULL; + struct gf_tw_timer_list *timer = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + scrub_monitor->boot = gf_time(); + + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + goto error_return; + } + + scrub_monitor->timer = GF_CALLOC(1, sizeof(*scrub_monitor->timer), + gf_br_stub_mt_br_scanner_freq_t); + if (!scrub_monitor->timer) + goto error_return; + + timer = scrub_monitor->timer; + INIT_LIST_HEAD(&timer->entry); + + timer->data = scrub_monitor; + timer->expires = timo; + timer->function = br_kickstart_scanner; + + gf_tw_add_timer(priv->timer_wheel, timer); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + + gf_time_fmt(timestr, sizeof(timestr), (scrub_monitor->boot + timo), + gf_timefmt_FT); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing is " + "scheduled to run at %s", + timestr); + + return 0; + +error_return: + return -1; +} + +int32_t +br_fsscan_activate(xlator_t *this) +{ + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + now = gf_time(); + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 
BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + return -1; + } + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + (void)gf_tw_mod_timer(priv->timer_wheel, scrub_monitor->timer, timo); + + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing is " + "rescheduled to run at %s", + timestr); + + return 0; +} + +int32_t +br_fsscan_reschedule(xlator_t *this) +{ + int32_t ret = 0; + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + if (!fsscrub->frequency_reconf) + return 0; + + now = gf_time(); + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + return -1; + } + + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer, + timo); + if (ret == 0) + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber is currently running and would be " + "rescheduled after completion"); + else { + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing rescheduled to run at %s", timestr); + } + + return 0; +} + +int32_t +br_fsscan_ondemand(xlator_t *this) +{ + int32_t ret = 0; + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 
0; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + now = gf_time(); + timo = BR_SCRUB_ONDEMAND; + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer, + timo); + if (ret == 0) + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber is currently running and would be " + "rescheduled after completion"); + else { + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Ondemand Scrubbing scheduled to run at %s", timestr); + } + + return 0; +} + +#define BR_SCRUB_THREAD_SCALE_LAZY 0 +#define BR_SCRUB_THREAD_SCALE_NORMAL 0.4 +#define BR_SCRUB_THREAD_SCALE_AGGRESSIVE 1.0 + +#ifndef M_E +#define M_E 2.718 +#endif + +/** + * This is just a simple exponential scale to a fixed value selected + * per throttle config. We probably need to be more smart and select + * the scale based on the number of processor cores too. 
+ */ +static unsigned int +br_scrubber_calc_scale(xlator_t *this, br_private_t *priv, + scrub_throttle_t throttle) +{ + unsigned int scale = 0; + + switch (throttle) { + case BR_SCRUB_THROTTLE_VOID: + case BR_SCRUB_THROTTLE_STALLED: + scale = 0; + break; + case BR_SCRUB_THROTTLE_LAZY: + scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_LAZY); + break; + case BR_SCRUB_THROTTLE_NORMAL: + scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_NORMAL); + break; + case BR_SCRUB_THROTTLE_AGGRESSIVE: + scale = priv->child_count * + pow(M_E, BR_SCRUB_THREAD_SCALE_AGGRESSIVE); + break; + default: + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_UNKNOWN_THROTTLE, + "Unknown throttle %d", throttle); + } + + return scale; +} + +static br_child_t * +_br_scrubber_get_next_child(struct br_scrubber *fsscrub) +{ + br_child_t *child = NULL; + + child = list_first_entry(&fsscrub->scrublist, br_child_t, list); + list_rotate_left(&fsscrub->scrublist); + + return child; +} + +static void +_br_scrubber_get_entry(br_child_t *child, struct br_fsscan_entry **fsentry) +{ + struct br_scanfs *fsscan = &child->fsscan; + + if (list_empty(&fsscan->ready)) + return; + *fsentry = list_first_entry(&fsscan->ready, struct br_fsscan_entry, list); + list_del_init(&(*fsentry)->list); +} + +static void +_br_scrubber_find_scrubbable_entry(struct br_scrubber *fsscrub, + struct br_fsscan_entry **fsentry) +{ + br_child_t *child = NULL; + br_child_t *firstchild = NULL; + + while (1) { + while (list_empty(&fsscrub->scrublist)) + pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex); + + firstchild = NULL; + for (child = _br_scrubber_get_next_child(fsscrub); child != firstchild; + child = _br_scrubber_get_next_child(fsscrub)) { + if (!firstchild) + firstchild = child; + + _br_scrubber_get_entry(child, fsentry); + if (*fsentry) + break; + } + + if (*fsentry) + break; + + /* nothing to work on.. 
wait till available */ + pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex); + } +} + +/** + * Store the next entry to scrub in *fsentry. The search runs with + * fsscrub->mutex held; _br_lock_cleaner is registered as the cancellation + * cleanup handler for that mutex while the search may block. + */ +static void +br_scrubber_pick_entry(struct br_scrubber *fsscrub, + struct br_fsscan_entry **fsentry) +{ + pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex); + + pthread_mutex_lock(&fsscrub->mutex); + { + *fsentry = NULL; + _br_scrubber_find_scrubbable_entry(fsscrub, fsentry); + } + pthread_mutex_unlock(&fsscrub->mutex); + + pthread_cleanup_pop(0); +} + +/* Argument handed to br_scrubber_entry_handle(): the entry being scrubbed + * and whether br_scrubber_scrub_begin() returned for it. */ +struct br_scrub_entry { + gf_boolean_t scrubbed; + struct br_fsscan_entry *fsentry; +}; + +/** + * We need to be a bit careful here. These thread(s) are prone to cancellations + * when threads are scaled down (depending on the throttling value configured) + * and pausing scrub. A thread can get cancelled while it's waiting for entries + * in the ->pending queue or when an object is undergoing scrubbing. + * + * Under fsscan->entrylock: a scrubbed entry is released (entry count + * decremented, members wiped, memory freed); an entry that was not scrubbed + * is handed back via _br_fsscan_collect_entry(). + */ +static void +br_scrubber_entry_handle(void *arg) +{ + struct br_scanfs *fsscan = NULL; + struct br_scrub_entry *sentry = NULL; + struct br_fsscan_entry *fsentry = NULL; + + sentry = arg; + + fsentry = sentry->fsentry; + fsscan = fsentry->fsscan; + + LOCK(&fsscan->entrylock); + { + if (sentry->scrubbed) { + _br_fsscan_dec_entry_count(fsscan); + + /* cleanup ->entry */ + fsentry->data = NULL; + fsentry->fsscan = NULL; + loc_wipe(&fsentry->parent); + gf_dirent_entry_free(fsentry->entry); + + GF_FREE(sentry->fsentry); + } else { + /* (re)queue the entry again for scrub */ + _br_fsscan_collect_entry(fsscan, sentry->fsentry); + } + } + UNLOCK(&fsscan->entrylock); +} + +/** + * Scrub a single entry. br_scrubber_entry_handle() is pushed as a cleanup + * handler and is also executed on the normal path (pthread_cleanup_pop(1)), + * with sentry.scrubbed telling it which of the two cases it is in. + */ +static void +br_scrubber_scrub_entry(xlator_t *this, struct br_fsscan_entry *fsentry) +{ + struct br_scrub_entry sentry = { + 0, + }; + + sentry.scrubbed = 0; + sentry.fsentry = fsentry; + + pthread_cleanup_push(br_scrubber_entry_handle, &sentry); + { + (void)br_scrubber_scrub_begin(this, fsentry); + sentry.scrubbed = 1; + } + pthread_cleanup_pop(1); +} + +void * +br_scrubber_proc(void *arg) +{ + xlator_t *this = NULL; + struct br_scrubber *fsscrub = NULL; +
struct br_fsscan_entry *fsentry = NULL; + + fsscrub = arg; + THIS = this = fsscrub->this; + + while (1) { + br_scrubber_pick_entry(fsscrub, &fsentry); + br_scrubber_scrub_entry(this, fsentry); + sleep(1); + } + + return NULL; +} + +/** + * Spawn (v2 - v1) additional scrubber threads, linking one br_scrubbers + * node per thread into fsscrub->scrubbers. + * + * Returns -1 only when allocating a node fails. A thread that cannot be + * created stops the scale-up early (degraded scaling, logged as a warning) + * and 0 is still returned. + */ +static int32_t +br_scrubber_scale_up(xlator_t *this, struct br_scrubber *fsscrub, + unsigned int v1, unsigned int v2) +{ + int i = 0; + int32_t ret = -1; + int diff = 0; + gf_boolean_t nomem = _gf_false; + struct br_scrubbers *scrub = NULL; + + diff = (int)(v2 - v1); + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALING_UP_SCRUBBER, + "Scaling up scrubbers [%d => %d]", v1, v2); + + for (i = 0; i < diff; i++) { + /* one node per thread; allocating "diff" nodes here wasted memory */ + scrub = GF_CALLOC(1, sizeof(*scrub), gf_br_mt_br_scrubber_t); + if (!scrub) { + nomem = _gf_true; + break; + } + + INIT_LIST_HEAD(&scrub->list); + ret = gf_thread_create(&scrub->scrubthread, NULL, br_scrubber_proc, + fsscrub, "brsproc"); + if (ret) { + /* node was never linked into ->scrubbers: release it here */ + GF_FREE(scrub); + scrub = NULL; + break; + } + + fsscrub->nr_scrubbers++; + list_add_tail(&scrub->list, &fsscrub->scrubbers); + } + + if (nomem) + goto error_return; + + if (i != diff) /* degraded scaling.. */ + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_UP_FAILED, + "Could not fully scale up to %d scrubber(s).
Spawned " + "%d/%d [total scrubber(s): %d]", + v2, i, diff, (v1 + i)); + + return 0; + +error_return: + return -1; +} + +/** + * Terminate (v1 - v2) scrubber threads, taking them from the head of + * fsscrub->scrubbers. A node is unlinked and freed only after its thread + * has been cleaned up, so a failed cleanup leaves the list and + * nr_scrubbers consistent. A partial scale-down is logged and still + * reported as success (0). + */ +static int32_t +br_scrubber_scale_down(xlator_t *this, struct br_scrubber *fsscrub, + unsigned int v1, unsigned int v2) +{ + int i = 0; + int diff = 0; + int32_t ret = -1; + struct br_scrubbers *scrub = NULL; + + diff = (int)(v1 - v2); + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALE_DOWN_SCRUBBER, + "Scaling down scrubbers [%d => %d]", v1, v2); + + for (i = 0; i < diff; i++) { + scrub = list_first_entry(&fsscrub->scrubbers, struct br_scrubbers, + list); + + ret = gf_thread_cleanup_xint(scrub->scrubthread); + if (ret) + break; /* thread still tracked: keep its node on the list */ + + list_del_init(&scrub->list); + GF_FREE(scrub); + + fsscrub->nr_scrubbers--; + } + + if (ret) { + /* target is v2; (v1 - i) scrubbers are still running */ + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_DOWN_FAILED, + "Could not fully scale down " + "to %d scrubber(s). Terminated %d/%d [total " + "scrubber(s): %d]", + v2, i, diff, (v1 - i)); + ret = 0; + } + + return ret; +} + +static int32_t +br_scrubber_configure(xlator_t *this, br_private_t *priv, + struct br_scrubber *fsscrub, scrub_throttle_t nthrottle) +{ + int32_t ret = 0; + unsigned int v1 = 0; + unsigned int v2 = 0; + + v1 = fsscrub->nr_scrubbers; + v2 = br_scrubber_calc_scale(this, priv, nthrottle); + + if (v1 == v2) + return 0; + + if (v1 > v2) + ret = br_scrubber_scale_down(this, fsscrub, v1, v2); + else + ret = br_scrubber_scale_up(this, fsscrub, v1, v2); + + return ret; +} + +static int32_t +br_scrubber_fetch_option(xlator_t *this, char *opt, dict_t *options, + char **value) +{ + if (options) + GF_OPTION_RECONF(opt, *value, options, str, error_return); + else + GF_OPTION_INIT(opt, *value, str, error_return); + + return 0; + +error_return: + return -1; +} + +/* internal "throttle" override */ +#define BR_SCRUB_STALLED "STALLED" + +/* TODO: token bucket spec */ +static int32_t +br_scrubber_handle_throttle(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t scrubstall) +{ + int32_t ret = 0; +
char *tmp = NULL; + struct br_scrubber *fsscrub = NULL; + scrub_throttle_t nthrottle = BR_SCRUB_THROTTLE_VOID; + + fsscrub = &priv->fsscrub; + fsscrub->throttle_reconf = _gf_false; + + ret = br_scrubber_fetch_option(this, "scrub-throttle", options, &tmp); + if (ret) + goto error_return; + + if (scrubstall) + tmp = BR_SCRUB_STALLED; + + if (strcasecmp(tmp, "lazy") == 0) + nthrottle = BR_SCRUB_THROTTLE_LAZY; + else if (strcasecmp(tmp, "normal") == 0) + nthrottle = BR_SCRUB_THROTTLE_NORMAL; + else if (strcasecmp(tmp, "aggressive") == 0) + nthrottle = BR_SCRUB_THROTTLE_AGGRESSIVE; + else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0) + nthrottle = BR_SCRUB_THROTTLE_STALLED; + else + goto error_return; + + /* on failure old throttling value is preserved */ + ret = br_scrubber_configure(this, priv, fsscrub, nthrottle); + if (ret) + goto error_return; + + if (fsscrub->throttle != nthrottle) + fsscrub->throttle_reconf = _gf_true; + + fsscrub->throttle = nthrottle; + return 0; + +error_return: + return -1; +} + +static int32_t +br_scrubber_handle_stall(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t *scrubstall) +{ + int32_t ret = 0; + char *tmp = NULL; + + ret = br_scrubber_fetch_option(this, "scrub-state", options, &tmp); + if (ret) + goto error_return; + + if (strcasecmp(tmp, "pause") == 0) /* anything else is active */ + *scrubstall = _gf_true; + + return 0; + +error_return: + return -1; +} + +static int32_t +br_scrubber_handle_freq(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t scrubstall) +{ + int32_t ret = -1; + char *tmp = NULL; + scrub_freq_t frequency = BR_FSSCRUB_FREQ_HOURLY; + struct br_scrubber *fsscrub = NULL; + + fsscrub = &priv->fsscrub; + fsscrub->frequency_reconf = _gf_true; + + ret = br_scrubber_fetch_option(this, "scrub-freq", options, &tmp); + if (ret) + goto error_return; + + if (scrubstall) + tmp = BR_SCRUB_STALLED; + + if (strcasecmp(tmp, "hourly") == 0) { + frequency = BR_FSSCRUB_FREQ_HOURLY; + } else if 
(strcasecmp(tmp, "daily") == 0) { + frequency = BR_FSSCRUB_FREQ_DAILY; + } else if (strcasecmp(tmp, "weekly") == 0) { + frequency = BR_FSSCRUB_FREQ_WEEKLY; + } else if (strcasecmp(tmp, "biweekly") == 0) { + frequency = BR_FSSCRUB_FREQ_BIWEEKLY; + } else if (strcasecmp(tmp, "monthly") == 0) { + frequency = BR_FSSCRUB_FREQ_MONTHLY; + } else if (strcasecmp(tmp, "minute") == 0) { + frequency = BR_FSSCRUB_FREQ_MINUTE; + } else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0) { + frequency = BR_FSSCRUB_FREQ_STALLED; + } else + goto error_return; + + if (fsscrub->frequency == frequency) + fsscrub->frequency_reconf = _gf_false; + else + fsscrub->frequency = frequency; + + return 0; + +error_return: + return -1; +} + +static void +br_scrubber_log_option(xlator_t *this, br_private_t *priv, + gf_boolean_t scrubstall) +{ + struct br_scrubber *fsscrub = &priv->fsscrub; + char *scrub_throttle_str[] = { + [BR_SCRUB_THROTTLE_LAZY] = "lazy", + [BR_SCRUB_THROTTLE_NORMAL] = "normal", + [BR_SCRUB_THROTTLE_AGGRESSIVE] = "aggressive", + [BR_SCRUB_THROTTLE_STALLED] = "stalled", + }; + + char *scrub_freq_str[] = { + [0] = "", + [BR_FSSCRUB_FREQ_HOURLY] = "hourly", + [BR_FSSCRUB_FREQ_DAILY] = "daily", + [BR_FSSCRUB_FREQ_WEEKLY] = "weekly", + [BR_FSSCRUB_FREQ_BIWEEKLY] = "biweekly", + [BR_FSSCRUB_FREQ_MONTHLY] = "monthly (30 days)", + [BR_FSSCRUB_FREQ_MINUTE] = "every minute", + }; + + if (scrubstall) + return; /* logged as pause */ + + if (fsscrub->frequency_reconf || fsscrub->throttle_reconf) { + if (fsscrub->throttle == BR_SCRUB_THROTTLE_VOID) + return; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_TUNABLE, + "SCRUB TUNABLES:: [Frequency: %s, Throttle: %s]", + scrub_freq_str[fsscrub->frequency], + scrub_throttle_str[fsscrub->throttle]); + } +} + +int32_t +br_scrubber_handle_options(xlator_t *this, br_private_t *priv, dict_t *options) +{ + int32_t ret = 0; + gf_boolean_t scrubstall = _gf_false; /* not as dangerous as it sounds */ + + ret = br_scrubber_handle_stall(this, priv, options, 
&scrubstall); + if (ret) + goto error_return; + + ret = br_scrubber_handle_throttle(this, priv, options, scrubstall); + if (ret) + goto error_return; + + ret = br_scrubber_handle_freq(this, priv, options, scrubstall); + if (ret) + goto error_return; + + br_scrubber_log_option(this, priv, scrubstall); + + return 0; + +error_return: + return -1; +} + +inode_t * +br_lookup_bad_obj_dir(xlator_t *this, br_child_t *child, uuid_t gfid) +{ + struct iatt statbuf = { + 0, + }; + inode_table_t *table = NULL; + int32_t ret = -1; + loc_t loc = { + 0, + }; + inode_t *linked_inode = NULL; + int32_t op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + + table = child->table; + + loc.inode = inode_new(table); + if (!loc.inode) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate a new inode for" + "bad object directory"); + goto out; + } + + gf_uuid_copy(loc.gfid, gfid); + + ret = syncop_lookup(child->xl, &loc, &statbuf, NULL, NULL, NULL); + if (ret < 0) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_LOOKUP_FAILED, + "failed to lookup the bad " + "objects directory (gfid: %s (%s))", + uuid_utoa(gfid), strerror(op_errno)); + goto out; + } + + linked_inode = inode_link(loc.inode, NULL, NULL, &statbuf); + if (linked_inode) + inode_lookup(linked_inode); + +out: + loc_wipe(&loc); + return linked_inode; +} + +int32_t +br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd, + dict_t *dict) +{ + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + int32_t ret = -1; + off_t offset = 0; + int32_t count = 0; + char key[32] = { + 0, + }; + dict_t *out_dict = NULL; + + INIT_LIST_HEAD(&entries.list); + + while ((ret = syncop_readdir(child->xl, fd, 131072, offset, &entries, NULL, + &out_dict))) { + if (ret < 0) + goto out; + + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + + 
snprintf(key, sizeof(key), "quarantine-%d", count); + + /* + * ignore the dict_set errors for now. The intention is + * to get as many bad objects as possible instead of + * erroring out at the first failure. + */ + ret = dict_set_dynstr_with_alloc(dict, key, entry->d_name); + if (!ret) + count++; + + if (out_dict) { + dict_copy(out_dict, dict); + dict_unref(out_dict); + out_dict = NULL; + } + } + + gf_dirent_free(&entries); + } + + /* the result of storing "count" is what the caller gets (< 0 on failure) */ + ret = dict_set_int32_sizen(dict, "count", count); + +out: + /* drop a reply dict that the entry loop did not get to consume */ + if (out_dict) + dict_unref(out_dict); + return ret; +} + +/** + * Collect the names found in one child's bad-object directory into dict. + * + * Finds the BR_BAD_OBJ_CONTAINER inode in the child's table (looking it up + * when it is not there), opens it as a directory and reads it with + * br_read_bad_object_dir(). Returns 0 on success and a negative value on + * failure. + */ +int32_t +br_get_bad_objects_from_child(xlator_t *this, dict_t *dict, br_child_t *child) +{ + inode_t *inode = NULL; + inode_table_t *table = NULL; + fd_t *fd = NULL; + int32_t ret = -1; + loc_t loc = { + 0, + }; + int32_t op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + table = child->table; + + inode = inode_find(table, BR_BAD_OBJ_CONTAINER); + if (!inode) { + inode = br_lookup_bad_obj_dir(this, child, BR_BAD_OBJ_CONTAINER); + if (!inode) + goto out; + } + + fd = fd_create(inode, 0); + if (!fd) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_FD_CREATE_FAILED, + "fd creation for the bad " + "objects directory failed (gfid: %s)", + uuid_utoa(BR_BAD_OBJ_CONTAINER)); + /* loc does not own the inode yet, so loc_wipe() will not drop it */ + inode_unref(inode); + goto out; + } + + /* from here on the inode reference is released by loc_wipe() */ + loc.inode = inode; + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_opendir(child->xl, &loc, fd, NULL, NULL); + if (ret < 0) { + op_errno = -ret; + fd_unref(fd); + fd = NULL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_FD_CREATE_FAILED, + "failed to open the bad " + "objects directory %s", + uuid_utoa(BR_BAD_OBJ_CONTAINER)); + goto out; + } + + fd_bind(fd); + + ret = br_read_bad_object_dir(this, child, fd, dict); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BAD_OBJ_READDIR_FAIL, + "readdir of the bad " + "objects directory (%s) failed ", +
uuid_utoa(BR_BAD_OBJ_CONTAINER)); + goto out; + } + + ret = 0; + +out: + loc_wipe(&loc); + if (fd) + fd_unref(fd); + return ret; +} + +int32_t +br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict, + dict_t *child_dict, int32_t total_count) +{ + int32_t ret = -1; + int32_t count = 0; + char key[32] = { + 0, + }; + char main_key[32] = { + 0, + }; + int32_t j = 0; + int32_t tmp_count = 0; + char *entry = NULL; + char tmp[PATH_MAX] = { + 0, + }; + char *path = NULL; + int32_t len = 0; + + ret = dict_get_int32_sizen(child_dict, "count", &count); + if (ret) + goto out; + + tmp_count = total_count; + + for (j = 0; j < count; j++) { + len = snprintf(key, sizeof(key), "quarantine-%d", j); + ret = dict_get_strn(child_dict, key, len, &entry); + if (ret) + continue; + + ret = dict_get_str(child_dict, entry, &path); + len = snprintf(tmp, PATH_MAX, "%s ==> BRICK: %s\n path: %s", entry, + child->brick_path, path); + if ((len < 0) || (len >= PATH_MAX)) { + continue; + } + snprintf(main_key, sizeof(main_key), "quarantine-%d", tmp_count); + + ret = dict_set_dynstr_with_alloc(dict, main_key, tmp); + if (!ret) + tmp_count++; + path = NULL; + } + + ret = tmp_count; + +out: + return ret; +} + +int32_t +br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict) +{ + int32_t ret = -1; + dict_t *child_dict = NULL; + int32_t i = 0; + int32_t total_count = 0; + br_child_t *child = NULL; + br_private_t *priv = NULL; + dict_t *tmp_dict = NULL; + + priv = this->private; + tmp_dict = dict; + + for (i = 0; i < priv->child_count; i++) { + child = &priv->children[i]; + GF_ASSERT(child); + if (!_br_is_child_connected(child)) + continue; + + child_dict = dict_new(); + if (!child_dict) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate dict"); + continue; + } + ret = br_get_bad_objects_from_child(this, child_dict, child); + /* + * Continue asking the remaining children for the list of + * bad objects even though getting the list 
from one of them + * fails. + */ + if (ret) { + dict_unref(child_dict); + continue; + } + + ret = br_collect_bad_objects_of_child(this, child, tmp_dict, child_dict, + total_count); + if (ret < 0) { + dict_unref(child_dict); + continue; + } + + total_count = ret; + dict_unref(child_dict); + child_dict = NULL; + } + + ret = dict_set_int32(tmp_dict, "total-count", total_count); + + return ret; +} + +/** + * Fill *dict with the bad objects reported by all children, allocating the + * dict first when *dict is NULL. Returns the result of + * br_collect_bad_objects_from_children(), or -1 on invalid arguments or + * allocation failure. + */ +int32_t +br_get_bad_objects_list(xlator_t *this, dict_t **dict) +{ + int32_t ret = -1; + dict_t *tmp_dict = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + tmp_dict = *dict; + if (!tmp_dict) { + tmp_dict = dict_new(); + if (!tmp_dict) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate dict"); + goto out; + } + *dict = tmp_dict; + } + + ret = br_collect_bad_objects_from_children(this, tmp_dict); + +out: + return ret; +} + +/** + * Block until scrub_monitor->done is set (waiting on donecond under + * donelock). Returns 0 once done, -1 when this or this->private is NULL. + */ +static int +wait_for_scrub_to_finish(xlator_t *this) +{ + int ret = -1; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + /* validate before the first dereference, not after it */ + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Waiting for all children to start and finish scrub"); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + while (!scrub_monitor->done) + pthread_cond_wait(&scrub_monitor->donecond, + &scrub_monitor->donelock); + } + pthread_mutex_unlock(&scrub_monitor->donelock); + ret = 0; +out: + return ret; +} + +/** + * This function is executed in a separate thread. This is scrubber monitor + * thread that takes care of state machine.
+ */ +void * +br_monitor_thread(void *arg) +{ + int32_t ret = 0; + xlator_t *this = NULL; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + this = arg; + priv = this->private; + + /* + * Since, this is the topmost xlator, THIS has to be set by bit-rot + * xlator itself (STACK_WIND won't help in this case). Also it has + * to be done for each thread that gets spawned. Otherwise, a new + * thread will get global_xlator's pointer when it does "THIS". + */ + THIS = this; + + scrub_monitor = &priv->scrub_monitor; + + pthread_mutex_lock(&scrub_monitor->mutex); + { + while (!scrub_monitor->inited) + pthread_cond_wait(&scrub_monitor->cond, &scrub_monitor->mutex); + } + pthread_mutex_unlock(&scrub_monitor->mutex); + + /* this needs to be serialized with reconfigure() */ + pthread_mutex_lock(&priv->lock); + { + ret = br_scrub_state_machine(this, _gf_false); + } + pthread_mutex_unlock(&priv->lock); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SSM_FAILED, + "Scrub state machine failed"); + goto out; + } + + while (1) { + /* Wait for all children to finish scrubbing */ + ret = wait_for_scrub_to_finish(this); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SCRUB_WAIT_FAILED, + "Scrub wait failed"); + goto out; + } + + /* scrub exit criteria: Move the state to PENDING */ + br_scrubber_exit_control(this); + } + +out: + return NULL; +} + +static void +br_set_scrub_state(struct br_monitor *scrub_monitor, br_scrub_state_t state) +{ + LOCK(&scrub_monitor->lock); + { + _br_monitor_set_scrub_state(scrub_monitor, state); + } + UNLOCK(&scrub_monitor->lock); +} + +int32_t +br_scrubber_monitor_init(xlator_t *this, br_private_t *priv) +{ + struct br_monitor *scrub_monitor = NULL; + int ret = 0; + + scrub_monitor = &priv->scrub_monitor; + + LOCK_INIT(&scrub_monitor->lock); + scrub_monitor->this = this; + + scrub_monitor->inited = _gf_false; + pthread_mutex_init(&scrub_monitor->mutex, NULL); + pthread_cond_init(&scrub_monitor->cond, 
NULL); + + scrub_monitor->kick = _gf_false; + scrub_monitor->active_child_count = 0; + pthread_mutex_init(&scrub_monitor->wakelock, NULL); + pthread_cond_init(&scrub_monitor->wakecond, NULL); + + scrub_monitor->done = _gf_false; + pthread_mutex_init(&scrub_monitor->donelock, NULL); + pthread_cond_init(&scrub_monitor->donecond, NULL); + + /* Set the state to INACTIVE */ + br_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_INACTIVE); + + /* Start the monitor thread */ + ret = gf_thread_create(&scrub_monitor->thread, NULL, br_monitor_thread, + this, "brmon"); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED, + "monitor thread creation failed"); + ret = -1; + goto err; + } + + return 0; +err: + pthread_mutex_destroy(&scrub_monitor->mutex); + pthread_cond_destroy(&scrub_monitor->cond); + + pthread_mutex_destroy(&scrub_monitor->wakelock); + pthread_cond_destroy(&scrub_monitor->wakecond); + + pthread_mutex_destroy(&scrub_monitor->donelock); + pthread_cond_destroy(&scrub_monitor->donecond); + + LOCK_DESTROY(&scrub_monitor->lock); + + return ret; +} + +int32_t +br_scrubber_init(xlator_t *this, br_private_t *priv) +{ + struct br_scrubber *fsscrub = NULL; + int ret = 0; + + priv->tbf = tbf_init(NULL, 0); + if (!priv->tbf) + return -1; + + ret = br_scrubber_monitor_init(this, priv); + if (ret) + return -1; + + fsscrub = &priv->fsscrub; + + fsscrub->this = this; + fsscrub->throttle = BR_SCRUB_THROTTLE_VOID; + + pthread_mutex_init(&fsscrub->mutex, NULL); + pthread_cond_init(&fsscrub->cond, NULL); + + fsscrub->nr_scrubbers = 0; + INIT_LIST_HEAD(&fsscrub->scrubbers); + INIT_LIST_HEAD(&fsscrub->scrublist); + + return 0; +} diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h new file mode 100644 index 00000000000..4e5f67bc021 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_SCRUB_H__ +#define __BIT_ROT_SCRUB_H__ + +#include <glusterfs/xlator.h> +#include "bit-rot.h" + +void * +br_fsscanner(void *); + +int32_t +br_fsscan_schedule(xlator_t *); +int32_t +br_fsscan_reschedule(xlator_t *); +int32_t +br_fsscan_activate(xlator_t *); +int32_t +br_fsscan_deactivate(xlator_t *); +int32_t +br_fsscan_ondemand(xlator_t *); + +int32_t +br_scrubber_handle_options(xlator_t *, br_private_t *, dict_t *); + +int32_t +br_scrubber_monitor_init(xlator_t *, br_private_t *); + +int32_t +br_scrubber_init(xlator_t *, br_private_t *); + +int32_t +br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict); + +void +br_child_set_scrub_state(br_child_t *, gf_boolean_t); + +#endif /* __BIT_ROT_SCRUB_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c new file mode 100644 index 00000000000..753e31a3b23 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c @@ -0,0 +1,124 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/
+
+#include "bit-rot-ssm.h"
+#include "bit-rot-scrub.h"
+#include "bit-rot-bitd-messages.h"
+/* Transition handler that does nothing; always returns 0. */
+int
+br_scrub_ssm_noop(xlator_t *this)
+{
+    return 0;
+}
+/* Transition handler: log and move the monitor to BR_SCRUB_STATE_PAUSED.
+ * Uses the underscore-prefixed setter -- presumably the caller already
+ * holds the monitor lock; TODO confirm. */
+int
+br_scrub_ssm_state_pause(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PAUSED);
+    return 0;
+}
+/* Transition handler: same log message as the pause handler, but the
+ * monitor is moved to BR_SCRUB_STATE_IPAUSED. */
+int
+br_scrub_ssm_state_ipause(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_IPAUSED);
+    return 0;
+}
+/* Transition handler: when scrub_monitor->done is set, hand over to
+ * br_fsscan_activate() (its result is ignored); otherwise log and move
+ * the monitor to BR_SCRUB_STATE_ACTIVE. Always returns 0. */
+int
+br_scrub_ssm_state_active(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (scrub_monitor->done) {
+        (void)br_fsscan_activate(this);
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+               "Scrubbing resumed");
+        _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_ACTIVE);
+    }
+
+    return 0;
+}
+/* Transition handler: log and move the monitor to
+ * BR_SCRUB_STATE_STALLED. */
+int
+br_scrub_ssm_state_stall(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Volume is under active scrubbing. Pausing scrub..");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_STALLED);
+    return 0;
+}
+/* Dispatch table indexed as [current state][event]. Each row is labelled
+ * with its state; br_scrub_state_machine() performs the lookup. */
+static br_scrub_ssm_call *br_scrub_ssm[BR_SCRUB_MAXSTATES][BR_SCRUB_MAXEVENTS] =
+    {
+        /* INACTIVE */
+        {br_fsscan_schedule, br_scrub_ssm_state_ipause, br_scrub_ssm_noop},
+        /* PENDING */
+        {br_fsscan_reschedule, br_fsscan_deactivate, br_fsscan_ondemand},
+        /* ACTIVE */
+        {br_scrub_ssm_noop, br_scrub_ssm_state_stall, br_scrub_ssm_noop},
+        /* PAUSED */
+        {br_fsscan_activate, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* IPAUSED */
+        {br_fsscan_schedule, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* STALLED */
+        {br_scrub_ssm_state_active, br_scrub_ssm_noop, br_scrub_ssm_noop},
+};
+/* Run one state-machine step: read the monitor's current state, pick the
+ * event (BR_SCRUB_EVENT_ONDEMAND when @scrub_ondemand is set, otherwise
+ * whatever _br_child_get_scrub_event() reports) and return the result of
+ * the matching handler.
+ * NOTE(review): neither index is range-checked before the table lookup. */
+int32_t
+br_scrub_state_machine(xlator_t *this, gf_boolean_t scrub_ondemand)
+{
+    br_private_t *priv = NULL;
+    br_scrub_ssm_call *call = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    br_scrub_state_t currstate = 0;
+    br_scrub_event_t event = 0;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    currstate = scrub_monitor->state;
+    if (scrub_ondemand)
+        event = BR_SCRUB_EVENT_ONDEMAND;
+    else
+        event = _br_child_get_scrub_event(fsscrub);
+
+    call = br_scrub_ssm[currstate][event];
+    return call(this);
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
new file mode 100644
index 00000000000..37b45a42eac
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
@@ -0,0 +1,38 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SSM_H__
+#define __BIT_ROT_SSM_H__
+
+#include <glusterfs/xlator.h>
+/* Scrubber states, numbered from 0. BR_SCRUB_MAXSTATES is the number of
+ * states and sizes the first dimension of the dispatch table. */
+typedef enum br_scrub_state {
+    BR_SCRUB_STATE_INACTIVE = 0,
+    BR_SCRUB_STATE_PENDING,
+    BR_SCRUB_STATE_ACTIVE,
+    BR_SCRUB_STATE_PAUSED,
+    BR_SCRUB_STATE_IPAUSED,
+    BR_SCRUB_STATE_STALLED,
+    BR_SCRUB_MAXSTATES,
+} br_scrub_state_t;
+/* Events fed to the state machine, numbered from 0. BR_SCRUB_MAXEVENTS
+ * is the number of events and sizes the second table dimension. */
+typedef enum br_scrub_event {
+    BR_SCRUB_EVENT_SCHEDULE = 0,
+    BR_SCRUB_EVENT_PAUSE,
+    BR_SCRUB_EVENT_ONDEMAND,
+    BR_SCRUB_MAXEVENTS,
+} br_scrub_event_t;
+
+struct br_monitor;
+/* Run one state-machine step; a true second argument selects
+ * BR_SCRUB_EVENT_ONDEMAND. */
+int32_t
+br_scrub_state_machine(xlator_t *, gf_boolean_t);
+
+#endif /* __BIT_ROT_SSM_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
new file mode 100644
index 00000000000..a2f1c343a1d
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -0,0 +1,2232 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <ctype.h>
+
+#include <glusterfs/logging.h>
+#include <glusterfs/compat-errno.h>
+
+#include "bit-rot.h"
+#include "bit-rot-scrub.h"
+#include <pthread.h>
+#include "bit-rot-bitd-messages.h"
+/* Size of each read issued while hashing an object (128 KiB). */
+#define BR_HASH_CALC_READ_SIZE (128 * 1024)
+
+typedef int32_t(br_child_handler)(xlator_t *, br_child_t *);
+/* A handler call queued for a child; `list` links it into a queue. */
+struct br_child_event {
+    xlator_t *this;
+
+    br_child_t *child;
+
+    br_child_handler *call;
+
+    struct list_head list;
+};
+/* Return the position of @child in priv->children, or -1 when it is not
+ * present or an argument fails validation. */
+static int
+br_find_child_index(xlator_t *this, xlator_t *child)
+{
+    br_private_t *priv = NULL;
+    int i = -1;
+    int index = -1;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (child == priv->children[i].xl) {
+            index = i;
+            break;
+        }
+    }
+
+out:
+    return index;
+}
+/* Linear search of priv->children (under priv->lock) for the child whose
+ * brick_path equals @brick_path. Returns NULL when nothing matches or an
+ * argument fails validation. */
+br_child_t *
+br_get_child_from_brick_path(xlator_t *this, char *brick_path)
+{
+    br_private_t *priv = NULL;
+    br_child_t *child = NULL;
+    br_child_t *tmp = NULL;
+    int i = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, brick_path, out);
+
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        for (i = 0; i < priv->child_count; i++) {
+            tmp = &priv->children[i];
+            if (!strcmp(tmp->brick_path, brick_path)) {
+                child = tmp;
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+out:
+    return child;
+}
+
+/**
+ * probably we'll encapsulate brick inside our own structure when
+ * needed -- later. For now the brick spec itself is handed back as the
+ * per-brick data.
+ */
+void *
+br_brick_init(void *xl, struct gf_brick_spec *brick)
+{
+    return brick;
+}
+
+/**
+ * and clean up here whatever gets allocated in br_brick_init().
+ * Nothing is allocated there at present, so this is a no-op.
+ */
+void
+br_brick_fini(void *xl, char *brick, void *data)
+{
+    return;
+}
+
+/**
+ * TODO: Signature can contain null terminators which causes bitrot
+ * stub to store truncated hash as it depends on string length of
+ * the hash.
+ * + * FIX: Send the string length as part of the signature struct and + * change stub to handle this change. + */ +static br_isignature_t * +br_prepare_signature(const unsigned char *sign, unsigned long hashlen, + int8_t hashtype, br_object_t *object) +{ + br_isignature_t *signature = NULL; + + /* TODO: use mem-pool */ + signature = GF_CALLOC(1, signature_size(hashlen + 1), + gf_br_stub_mt_signature_t); + if (!signature) + return NULL; + + /* object version */ + signature->signedversion = object->signedversion; + + /* signature length & type */ + signature->signaturelen = hashlen; + signature->signaturetype = hashtype; + + /* signature itself */ + memcpy(signature->signature, (char *)sign, hashlen); + signature->signature[hashlen + 1] = '\0'; + + return signature; +} + +gf_boolean_t +bitd_is_bad_file(xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd) +{ + int32_t ret = -1; + dict_t *xattr = NULL; + inode_t *inode = NULL; + gf_boolean_t bad_file = _gf_false; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + + inode = (loc) ? loc->inode : fd->inode; + + if (fd) + ret = syncop_fgetxattr(child->xl, fd, &xattr, BITROT_OBJECT_BAD_KEY, + NULL, NULL); + else if (loc) + ret = syncop_getxattr(child->xl, loc, &xattr, BITROT_OBJECT_BAD_KEY, + NULL, NULL); + + if (!ret) { + gf_msg_debug(this->name, 0, "[GFID: %s] is marked corrupted", + uuid_utoa(inode->gfid)); + bad_file = _gf_true; + } + + if (xattr) + dict_unref(xattr); + +out: + return bad_file; +} + +/** + * Do a lookup on the gfid present within the object. 
+ */
+static int32_t
+br_object_lookup(xlator_t *this, br_object_t *object, struct iatt *iatt,
+                 inode_t **linked_inode)
+{
+    int ret = -EINVAL;
+    loc_t loc = {
+        0,
+    };
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, object, out);
+
+    /* reuse the inode when the child's table already has it */
+    inode = inode_find(object->child->table, object->gfid);
+
+    if (inode)
+        loc.inode = inode;
+    else
+        loc.inode = inode_new(object->child->table);
+
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, object->gfid);
+
+    ret = syncop_lookup(object->child->xl, &loc, iatt, NULL, NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /*
+     * The file might have been deleted by the application
+     * after getting the event, but before doing a lookup.
+     * So use linked_inode after inode_link is done.
+     *
+     * NOTE(review): the return value stays as set by syncop_lookup()
+     * even if inode_link() hands back NULL, so a caller can see success
+     * together with *linked_inode == NULL.
+     */
+    *linked_inode = inode_link(loc.inode, NULL, NULL, iatt);
+    if (*linked_inode)
+        inode_lookup(*linked_inode);
+
+out:
+    loc_wipe(&loc); /* drops the reference held in loc.inode */
+    return ret;
+}
+
+/**
+ * open the object with O_RDONLY flags and return the fd. How to let brick
+ * know that open is being done by bitd because syncop framework does not allow
+ * passing xdata -- may be use frame->root->pid itself.
+ */ +static int32_t +br_object_open(xlator_t *this, br_object_t *object, inode_t *inode, + fd_t **openfd) +{ + int32_t ret = -1; + fd_t *fd = NULL; + loc_t loc = { + 0, + }; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, object, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = -EINVAL; + fd = fd_create(inode, 0); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_open(object->child->xl, &loc, O_RDONLY, fd, NULL, NULL); + if (ret) { + br_log_object(this, "open", inode->gfid, -ret); + fd_unref(fd); + fd = NULL; + } else { + fd_bind(fd); + *openfd = fd; + } + + loc_wipe(&loc); + +out: + return ret; +} + +/** + * read 128k block from the object @object from the offset @offset + * and return the buffer. + */ +static int32_t +br_object_read_block_and_sign(xlator_t *this, fd_t *fd, br_child_t *child, + off_t offset, size_t size, SHA256_CTX *sha256) +{ + int32_t ret = -1; + tbf_t *tbf = NULL; + struct iovec *iovec = NULL; + struct iobref *iobref = NULL; + br_private_t *priv = NULL; + int count = 0; + int i = 0; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + priv = this->private; + + GF_VALIDATE_OR_GOTO(this->name, priv->tbf, out); + tbf = priv->tbf; + + ret = syncop_readv(child->xl, fd, size, offset, 0, &iovec, &count, &iobref, + NULL, NULL, NULL); + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_READV_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + ret = -1; + goto out; + } + + if (ret == 0) + goto out; + + for (i = 0; i < count; i++) { + TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len); + { + SHA256_Update(sha256, 
(const unsigned char *)(iovec[i].iov_base), + iovec[i].iov_len); + } + TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len); + } + +out: + if (iovec) + GF_FREE(iovec); + + if (iobref) + iobref_unref(iobref); + + return ret; +} + +int32_t +br_calculate_obj_checksum(unsigned char *md, br_child_t *child, fd_t *fd, + struct iatt *iatt) +{ + int32_t ret = -1; + off_t offset = 0; + size_t block = BR_HASH_CALC_READ_SIZE; + xlator_t *this = NULL; + + SHA256_CTX sha256; + + GF_VALIDATE_OR_GOTO("bit-rot", child, out); + GF_VALIDATE_OR_GOTO("bit-rot", iatt, out); + GF_VALIDATE_OR_GOTO("bit-rot", fd, out); + + this = child->this; + + SHA256_Init(&sha256); + + while (1) { + ret = br_object_read_block_and_sign(this, fd, child, offset, block, + &sha256); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED, + "offset=%" PRIu64, offset, "object-gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + break; + } + + if (ret == 0) + break; + + offset += ret; + } + + if (ret == 0) + SHA256_Final(md, &sha256); + +out: + return ret; +} + +static int32_t +br_object_checksum(unsigned char *md, br_object_t *object, fd_t *fd, + struct iatt *iatt) +{ + return br_calculate_obj_checksum(md, object->child, fd, iatt); +} + +static int32_t +br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object, + struct iatt *iatt) +{ + int32_t ret = -1; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned char *md = NULL; + br_isignature_t *sign = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", object, out); + GF_VALIDATE_OR_GOTO("bit-rot", linked_inode, out); + GF_VALIDATE_OR_GOTO("bit-rot", fd, out); + + this = object->this; + + md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char); + if (!md) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_SAVING_HASH_FAILED, + "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + ret = br_object_checksum(md, object, fd, iatt); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, 
BRB_MSG_CALC_CHECKSUM_FAILED, + "object-gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto free_signature; + } + + sign = br_prepare_signature(md, SHA256_DIGEST_LENGTH, + BR_SIGNATURE_TYPE_SHA256, object); + if (!sign) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto free_signature; + } + + xattr = dict_for_key_value(GLUSTERFS_SET_OBJECT_SIGNATURE, (void *)sign, + signature_size(SHA256_DIGEST_LENGTH), _gf_true); + + if (!xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "dict-allocation object-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + goto free_isign; + } + + ret = syncop_fsetxattr(object->child->xl, fd, xattr, 0, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "fsetxattr object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unref_dict; + } + + ret = 0; + +unref_dict: + dict_unref(xattr); +free_isign: + GF_FREE(sign); +free_signature: + GF_FREE(md); +out: + return ret; +} + +static int +br_object_sign_softerror(int32_t op_errno) +{ + return ((op_errno == ENOENT) || (op_errno == ESTALE) || + (op_errno == ENODATA)); +} + +void +br_log_object(xlator_t *this, char *op, uuid_t gfid, int32_t op_errno) +{ + int softerror = br_object_sign_softerror(op_errno); + if (softerror) { + gf_msg_debug(this->name, 0, + "%s() failed on object %s " + "[reason: %s]", + op, uuid_utoa(gfid), strerror(op_errno)); + } else { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s", + op, "gfid=%s", uuid_utoa(gfid), NULL); + } +} + +void +br_log_object_path(xlator_t *this, char *op, const char *path, int32_t op_errno) +{ + int softerror = br_object_sign_softerror(op_errno); + if (softerror) { + gf_msg_debug(this->name, 0, + "%s() failed on object %s " + "[reason: %s]", + op, path, strerror(op_errno)); + } else { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s", + op, "path=%s", path, NULL); 
+ } +} + +static void +br_trigger_sign(xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc, gf_boolean_t need_reopen) +{ + fd_t *fd = NULL; + int32_t ret = -1; + uint32_t val = 0; + dict_t *dict = NULL; + pid_t pid = GF_CLIENT_PID_BITD; + + syncopctx_setfspid(&pid); + + val = (need_reopen == _gf_true) ? BR_OBJECT_REOPEN : BR_OBJECT_RESIGN; + + dict = dict_new(); + if (!dict) + goto out; + + ret = dict_set_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, val); + if (ret) + goto cleanup_dict; + + ret = -1; + fd = fd_create(linked_inode, 0); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto cleanup_dict; + } + + ret = syncop_open(child->xl, loc, O_RDWR, fd, NULL, NULL); + if (ret) { + br_log_object(this, "open", linked_inode->gfid, -ret); + goto unref_fd; + } + + fd_bind(fd); + + ret = syncop_fsetxattr(child->xl, fd, dict, 0, NULL, NULL); + if (ret) + br_log_object(this, "fsetxattr", linked_inode->gfid, -ret); + + /* passthough: fd_unref() */ + +unref_fd: + fd_unref(fd); +cleanup_dict: + dict_unref(dict); +out: + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), "reopen-hint-val=%d", + val, NULL); + } +} + +static void +br_object_resign(xlator_t *this, br_object_t *object, inode_t *linked_inode) +{ + loc_t loc = { + 0, + }; + + loc.inode = inode_ref(linked_inode); + gf_uuid_copy(loc.gfid, linked_inode->gfid); + + br_trigger_sign(this, object->child, linked_inode, &loc, _gf_false); + + loc_wipe(&loc); +} + +/** + * Sign a given object. This routine runs full throttle. There needs to be + * some form of priority scheduling and/or read burstness to avoid starving + * (or kicking) client I/O's. 
+ */ +static int32_t +br_sign_object(br_object_t *object) +{ + int32_t ret = -1; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + struct iatt iatt = { + 0, + }; + pid_t pid = GF_CLIENT_PID_BITD; + br_sign_state_t sign_info = BR_SIGN_NORMAL; + + GF_VALIDATE_OR_GOTO("bit-rot", object, out); + + this = object->this; + + /** + * FIXME: This is required as signing an object is restricted to + * clients with special frame->root->pid. Change the way client + * pid is set. + */ + syncopctx_setfspid(&pid); + + ret = br_object_lookup(this, object, &iatt, &linked_inode); + if (ret) { + br_log_object(this, "lookup", object->gfid, -ret); + goto out; + } + + /** + * For fd's that have notified for reopening, we send an explicit + * open() followed by a dummy write() call. This triggers the + * actual signing of the object. + */ + sign_info = ntohl(object->sign_info); + if (sign_info == BR_SIGN_REOPEN_WAIT) { + br_object_resign(this, object, linked_inode); + goto unref_inode; + } + + ret = br_object_open(this, object, linked_inode, &fd); + if (!fd) { + br_log_object(this, "open", object->gfid, -ret); + goto unref_inode; + } + + /** + * we have an open file descriptor on the object. from here on, + * do not be generous to file operation errors. 
+ */ + gf_msg_debug(this->name, 0, "Signing object [%s]", + uuid_utoa(linked_inode->gfid)); + + ret = br_object_read_sign(linked_inode, fd, object, &iatt); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto unref_fd; + } + + ret = 0; + +unref_fd: + fd_unref(fd); +unref_inode: + inode_unref(linked_inode); +out: + return ret; +} + +static br_object_t * +__br_pick_object(br_private_t *priv) +{ + br_object_t *object = NULL; + + while (list_empty(&priv->obj_queue->objects)) { + pthread_cond_wait(&priv->object_cond, &priv->lock); + } + + object = list_first_entry(&priv->obj_queue->objects, br_object_t, list); + list_del_init(&object->list); + + return object; +} + +/** + * This is the place where the signing of the objects is triggered. + */ +void * +br_process_object(void *arg) +{ + xlator_t *this = NULL; + br_object_t *object = NULL; + br_private_t *priv = NULL; + int32_t ret = -1; + + this = arg; + priv = this->private; + + THIS = this; + + for (;;) { + pthread_mutex_lock(&priv->lock); + { + object = __br_pick_object(priv); + } + pthread_mutex_unlock(&priv->lock); + + ret = br_sign_object(object); + if (ret && !br_object_sign_softerror(-ret)) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "gfid=%s", uuid_utoa(object->gfid), NULL); + GF_FREE(object); + } + + return NULL; +} + +/** + * This function gets kicked in once the object is expired from the + * timer wheel. This actually adds the object received via notification + * from the changelog to the queue from where the objects gets picked + * up for signing. + * + * This routine can be made lightweight by introducing an alternate + * timer-wheel API that dispatches _all_ expired objects in one-shot + * rather than an object at-a-time. This routine can then just simply + * be a call to list_splice_tail(). + * + * NOTE: use call_time to instrument signing time in br_sign_object(). 
+ */ +void +br_add_object_to_queue(struct gf_tw_timer_list *timer, void *data, + unsigned long call_time) +{ + br_object_t *object = NULL; + xlator_t *this = NULL; + br_private_t *priv = NULL; + + object = data; + this = object->this; + priv = this->private; + + THIS = this; + + pthread_mutex_lock(&priv->lock); + { + list_add_tail(&object->list, &priv->obj_queue->objects); + pthread_cond_broadcast(&priv->object_cond); + } + pthread_mutex_unlock(&priv->lock); + + if (timer) + mem_put(timer); + return; +} + +static br_object_t * +br_initialize_object(xlator_t *this, br_child_t *child, changelog_event_t *ev) +{ + br_object_t *object = NULL; + + object = GF_CALLOC(1, sizeof(*object), gf_br_mt_br_object_t); + if (!object) + goto out; + INIT_LIST_HEAD(&object->list); + + object->this = this; + object->child = child; + gf_uuid_copy(object->gfid, ev->u.releasebr.gfid); + + /* NOTE: it's BE, but no worry */ + object->signedversion = ev->u.releasebr.version; + object->sign_info = ev->u.releasebr.sign_info; + +out: + return object; +} + +static struct gf_tw_timer_list * +br_initialize_timer(xlator_t *this, br_object_t *object, br_child_t *child, + changelog_event_t *ev) +{ + br_private_t *priv = NULL; + struct gf_tw_timer_list *timer = NULL; + + priv = this->private; + + timer = mem_get0(child->timer_pool); + if (!timer) + goto out; + INIT_LIST_HEAD(&timer->entry); + + timer->expires = priv->expiry_time; + if (!timer->expires) + timer->expires = 1; + + timer->data = object; + timer->function = br_add_object_to_queue; + gf_tw_add_timer(priv->timer_wheel, timer); + +out: + return timer; +} + +static int32_t +br_schedule_object_reopen(xlator_t *this, br_object_t *object, + br_child_t *child, changelog_event_t *ev) +{ + struct gf_tw_timer_list *timer = NULL; + + timer = br_initialize_timer(this, object, child, ev); + if (!timer) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED, + "gfid=%s", uuid_utoa(object->gfid), NULL); + return timer ? 
0 : -1; +} + +static int32_t +br_object_quicksign(xlator_t *this, br_object_t *object) +{ + br_add_object_to_queue(NULL, object, 0ULL); + return 0; +} + +/** + * This callback function registered with the changelog is executed + * whenever a notification from the changelog is received. This should + * add the object (or the gfid) on which the notification has come to + * the timer-wheel with some expiry time. + * + * TODO: use mem-pool for allocations and maybe allocate timer and + * object as a single alloc and bifurcate their respective pointers. + */ +void +br_brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev) +{ + int32_t ret = 0; + uuid_t gfid = { + 0, + }; + xlator_t *this = NULL; + br_object_t *object = NULL; + br_child_t *child = NULL; + br_sign_state_t sign_info = BR_SIGN_INVALID; + + this = xl; + + GF_VALIDATE_OR_GOTO(this->name, ev, out); + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + GF_ASSERT(ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE); + GF_ASSERT(!gf_uuid_is_null(ev->u.releasebr.gfid)); + + gf_uuid_copy(gfid, ev->u.releasebr.gfid); + + gf_msg_debug(this->name, 0, "RELEASE EVENT [GFID %s]", uuid_utoa(gfid)); + + child = br_get_child_from_brick_path(this, brick); + if (!child) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED, + "brick=%s", brick, NULL); + goto out; + } + + object = br_initialize_object(this, child, ev); + if (!object) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "object-gfid=%s", uuid_utoa(gfid), NULL); + goto out; + } + + /* sanity check */ + sign_info = ntohl(object->sign_info); + GF_ASSERT(sign_info != BR_SIGN_NORMAL); + + if (sign_info == BR_SIGN_REOPEN_WAIT) + ret = br_schedule_object_reopen(this, object, child, ev); + else + ret = br_object_quicksign(this, object); + + if (ret) + goto free_object; + + gf_msg_debug(this->name, 0, "->callback: brick [%s], type [%d]\n", brick, + ev->ev_type); + return; + 
+free_object: + GF_FREE(object); +out: + return; +} + +void +br_fill_brick_spec(struct gf_brick_spec *brick, char *path) +{ + brick->brick_path = gf_strdup(path); + brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE; + + brick->init = br_brick_init; + brick->fini = br_brick_fini; + brick->callback = br_brick_callback; + brick->connected = NULL; + brick->disconnected = NULL; +} + +static gf_boolean_t +br_check_object_need_sign(xlator_t *this, dict_t *xattr, br_child_t *child) +{ + int32_t ret = -1; + gf_boolean_t need_sign = _gf_false; + br_isignature_out_t *sign = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, xattr, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + + ret = dict_get_ptr(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)&sign); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "object-info", NULL); + goto out; + } + + /* Object has been opened and hence dirty. Do not sign it */ + if (sign->stale) + need_sign = _gf_true; + +out: + return need_sign; +} + +int32_t +br_prepare_loc(xlator_t *this, br_child_t *child, loc_t *parent, + gf_dirent_t *entry, loc_t *loc) +{ + int32_t ret = -1; + inode_t *inode = NULL; + + inode = inode_grep(child->table, parent->inode, entry->d_name); + if (!inode) + loc->inode = inode_new(child->table); + else { + loc->inode = inode; + if (loc->inode->ia_type != IA_IFREG) { + gf_msg_debug(this->name, 0, + "%s is not a regular " + "file", + entry->d_name); + ret = 0; + goto out; + } + } + + loc->parent = inode_ref(parent->inode); + gf_uuid_copy(loc->pargfid, parent->inode->gfid); + + ret = inode_path(parent->inode, entry->d_name, (char **)&loc->path); + if (ret < 0 || !loc->path) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED, + "inode_path=%s", entry->d_name, "parent-gfid=%s", + uuid_utoa(parent->inode->gfid), NULL); + goto out; + } + + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + + ret = 1; + +out: + return ret; +} + +/** 
+ * Oneshot crawler
+ * ---------------
+ * This is a catchup mechanism. Objects that remained unsigned from the
+ * last run for whatever reason (node crashes, reboots, etc..) become
+ * candidates for signing. This allows the signature to "catch up" with
+ * the current state of the object. Triggering signing is easy: perform
+ * an open() followed by a close() thereby resulting in call boomerang.
+ * (though not back to itself :))
+ */
+int
+bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                   void *data)
+{
+    int op_errno = 0;
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    int32_t ret = -1;
+    inode_t *linked_inode = NULL;
+    gf_boolean_t need_signing = _gf_false;
+    gf_boolean_t need_reopen = _gf_true;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", data, out);
+
+    child = data;
+    this = child->this;
+    /* br_prepare_loc(): 0 means "not a regular file, skip", 1 means loc
+     * is ready. NOTE(review): a negative return (path build failure) is
+     * not treated as a skip and falls through to the lookup. */
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (!ret)
+        goto out;
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+    /* NOTE(review): linked_inode is dereferenced further down without a
+     * NULL check, although inode_link()'s result is tested here. */
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not a regular file, "
+                     "skipping..",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /**
+     * As of now, 2 cases are possible and handled.
+     * 1) GlusterFS is upgraded from a previous version which does not
+     *    have any idea about bit-rot and have data in the filesystem.
+     *    In this case syncop_getxattr fails with ENODATA and the object
+     *    is signed. (In real, when crawler sends lookup, bit-rot-stub
+     *    creates the xattrs before returning lookup reply)
+     * 2) Bit-rot was not enabled or BitD was down for some reason, during
+     *    which some files were created, but since BitD was down, were not
+     *    signed.
+     * If the file was just created and was being written some data when
+     * the down BitD came up, then bit-rot stub should be intelligent to
+     * identify this case (by comparing the ongoing version or by checking
+     * if there are any fds present for that inode) and handle properly.
+     */
+
+    if (bitd_is_bad_file(this, child, &loc, NULL)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, "path=%s",
+                loc.path, NULL);
+        goto unref_inode;
+    }
+
+    ret = syncop_getxattr(child->xl, &loc, &xattr,
+                          GLUSTERFS_GET_OBJECT_SIGNATURE, NULL, NULL);
+    if (ret < 0) {
+        op_errno = -ret;
+        br_log_object(this, "getxattr", linked_inode->gfid, op_errno);
+
+        /**
+         * No need to sign the zero byte objects as the signing
+         * happens upon first modification of the object.
+         */
+        if (op_errno == ENODATA && (iatt.ia_size != 0))
+            need_signing = _gf_true;
+        if (op_errno == EINVAL)
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    BRB_MSG_PARTIAL_VERSION_PRESENCE, "gfid=%s",
+                    uuid_utoa(linked_inode->gfid), NULL);
+    } else {
+        need_signing = br_check_object_need_sign(this, xattr, child);
+
+        /*
+         * If we are here means, bitrot daemon has started. Is it just
+         * a simple restart of the daemon or is it started because the
+         * feature is enabled is something hard to determine. Hence,
+         * if need_signing is false (because bit-rot version and signature
+         * are present), then still go ahead and sign it.
+         *
+         * NOTE(review): need_reopen starts as _gf_true and is never set
+         * to _gf_false in this function, so the assignment below does
+         * not change its value.
+         */
+        if (!need_signing) {
+            need_signing = _gf_true;
+            need_reopen = _gf_true;
+        }
+    }
+
+    if (!need_signing)
+        goto unref_dict;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN, "path=%s",
+            loc.path, "gfid=%s", uuid_utoa(linked_inode->gfid), "Brick-path=%s",
+            child->brick_path, NULL);
+    br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
+
+    ret = 0;
+
+unref_dict:
+    if (xattr)
+        dict_unref(xattr);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    loc_wipe(&loc);
+
+    return ret;
+}
+/* Arguments handed to syncop_ftw_throttle() by br_oneshot_signer(). */
+#define BR_CRAWL_THROTTLE_COUNT 50
+#define BR_CRAWL_THROTTLE_ZZZ 5
+/* Thread body for the one-shot crawl of a child: walks the brick from
+ * its root inode with bitd_oneshot_crawl() as the per-entry callback and
+ * logs start and finish. The walk's result is ignored. @arg is the
+ * br_child_t. */
+void *
+br_oneshot_signer(void *arg)
+{
+    loc_t loc = {
+        0,
+    };
+    xlator_t *this = NULL;
+    br_child_t *child = NULL;
+
+    child = arg;
+    this = child->this;
+
+    THIS = this;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START, "brick-path=%s",
+            child->brick_path, NULL);
+
+    loc.inode = child->table->root;
+    (void)syncop_ftw_throttle(child->xl, &loc, GF_CLIENT_PID_BITD, child,
+                              bitd_oneshot_crawl, BR_CRAWL_THROTTLE_COUNT,
+                              BR_CRAWL_THROTTLE_ZZZ);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH,
+            "brick-path=%s", child->brick_path, NULL);
+
+    return NULL;
+}
+/* Locked wrapper: set the child's state while holding child->lock. */
+static void
+br_set_child_state(br_child_t *child, br_child_state_t state)
+{
+    pthread_mutex_lock(&child->lock);
+    {
+        _br_set_child_state(child, state);
+    }
+    pthread_mutex_unlock(&child->lock);
+}
+
+/**
+ * At this point a thread is spawned to crawl the filesystem (in
+ * tortoise pace) to sign objects that were not signed in previous run(s).
+ * Such objects are identified by examining it's dirtyness and timestamp.
+ *
+ *    pick object:
+ *       signature_is_stale() && (object_timestamp() <= stub_init_time())
+ *
+ * Also, we register to the changelog library to subscribe for event
+ * notifications.
+ */ +static int32_t +br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct gf_brick_spec *brick = NULL; + + priv = this->private; + + brick = GF_CALLOC(1, sizeof(struct gf_brick_spec), + gf_common_mt_gf_brick_spec_t); + if (!brick) + goto error_return; + + br_fill_brick_spec(brick, stub->export); + ret = gf_changelog_register_generic(brick, 1, 1, + this->ctx->cmd_args.log_file, -1, this); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED, NULL); + goto dealloc; + } + + child->threadrunning = 0; + ret = gf_thread_create(&child->thread, NULL, br_oneshot_signer, child, + "brosign"); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED, + "FS-crawler-thread", NULL); + else + child->threadrunning = 1; + + /* it's OK to continue, "old" objects would be signed when modified */ + list_add_tail(&child->list, &priv->signing); + return 0; + +dealloc: + GF_FREE(brick); +error_return: + return -1; +} + +static int32_t +br_launch_scrubber(xlator_t *this, br_child_t *child, struct br_scanfs *fsscan, + struct br_scrubber *fsscrub) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + + scrub_monitor = &priv->scrub_monitor; + ret = gf_thread_create(&child->thread, NULL, br_fsscanner, child, + "brfsscan"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED, + "bitrot-scrubber-daemon Brick-path=%s", child->brick_path, + NULL); + goto error_return; + } + + /* Signal monitor to kick off state machine*/ + pthread_mutex_lock(&scrub_monitor->mutex); + { + if (!scrub_monitor->inited) + pthread_cond_signal(&scrub_monitor->cond); + scrub_monitor->inited = _gf_true; + } + pthread_mutex_unlock(&scrub_monitor->mutex); + + /** + * Everything has been setup.. add this subvolume to scrubbers + * list. 
+ */ + pthread_mutex_lock(&fsscrub->mutex); + { + list_add_tail(&child->list, &fsscrub->scrublist); + pthread_cond_broadcast(&fsscrub->cond); + } + pthread_mutex_unlock(&fsscrub->mutex); + + return 0; + +error_return: + return -1; +} + +static int32_t +br_enact_scrubber(xlator_t *this, br_child_t *child) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct br_scanfs *fsscan = NULL; + struct br_scrubber *fsscrub = NULL; + + priv = this->private; + + fsscan = &child->fsscan; + fsscrub = &priv->fsscrub; + + /** + * if this child already witnesses a successful connection earlier + * there's no need to initialize mutexes, condvars, etc.. + */ + if (_br_child_witnessed_connection(child)) + return br_launch_scrubber(this, child, fsscan, fsscrub); + + LOCK_INIT(&fsscan->entrylock); + pthread_mutex_init(&fsscan->waitlock, NULL); + pthread_cond_init(&fsscan->waitcond, NULL); + + fsscan->entries = 0; + INIT_LIST_HEAD(&fsscan->queued); + INIT_LIST_HEAD(&fsscan->ready); + + ret = br_launch_scrubber(this, child, fsscan, fsscrub); + if (ret) + goto error_return; + + return 0; + +error_return: + LOCK_DESTROY(&fsscan->entrylock); + pthread_mutex_destroy(&fsscan->waitlock); + pthread_cond_destroy(&fsscan->waitcond); + + return -1; +} + +static int32_t +br_child_enaction(xlator_t *this, br_child_t *child, br_stub_init_t *stub) +{ + int32_t ret = -1; + br_private_t *priv = this->private; + + pthread_mutex_lock(&child->lock); + { + if (priv->iamscrubber) + ret = br_enact_scrubber(this, child); + else + ret = br_enact_signer(this, child, stub); + + if (!ret) { + child->witnessed = 1; + _br_set_child_state(child, BR_CHILD_STATE_CONNECTED); + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK, + "brick-path=%s", child->brick_path, NULL); + } + } + pthread_mutex_unlock(&child->lock); + + return ret; +} + +/** + * This routine fetches various attributes associated with a child which + * is basically a subvolume. Attributes include brick path and the stub + * birth time. 
This is done by performing a lookup on the root followed + * by getxattr() on a virtual key. Depending on the configuration, the + * process either acts as a signer or a scrubber. + */ +int32_t +br_brick_connect(xlator_t *this, br_child_t *child) +{ + int32_t ret = -1; + loc_t loc = { + 0, + }; + struct iatt buf = { + 0, + }; + struct iatt parent = { + 0, + }; + br_stub_init_t *stub = NULL; + dict_t *xattr = NULL; + int op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + br_child_set_scrub_state(child, _gf_false); + br_set_child_state(child, BR_CHILD_STATE_INITIALIZING); + + loc.inode = inode_ref(child->table->root); + gf_uuid_copy(loc.gfid, loc.inode->gfid); + loc.path = gf_strdup("/"); + + ret = syncop_lookup(child->xl, &loc, &buf, &parent, NULL, NULL); + if (ret) { + op_errno = -ret; + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED, + NULL); + goto wipeloc; + } + + ret = syncop_getxattr(child->xl, &loc, &xattr, + GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL, NULL); + if (ret) { + op_errno = -ret; + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED, + NULL); + goto wipeloc; + } + + ret = dict_get_ptr(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, (void **)&stub); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED, NULL); + goto free_dict; + } + + memcpy(child->brick_path, stub->export, strlen(stub->export) + 1); + child->tv.tv_sec = ntohl(stub->timebuf[0]); + child->tv.tv_usec = ntohl(stub->timebuf[1]); + + ret = br_child_enaction(this, child, stub); + +free_dict: + dict_unref(xattr); +wipeloc: + loc_wipe(&loc); +out: + if (ret) + br_set_child_state(child, BR_CHILD_STATE_CONNFAILED); + return ret; +} + +/* TODO: cleanup signer */ +static int32_t +br_cleanup_signer(xlator_t *this, br_child_t *child) +{ + return 0; +} + +static int32_t +br_cleanup_scrubber(xlator_t *this, 
br_child_t *child) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + if (_br_is_child_scrub_active(child)) { + scrub_monitor->active_child_count--; + br_child_set_scrub_state(child, _gf_false); + } + + /** + * 0x0: child (brick) goes out of rotation + * + * This is fully safe w.r.t. entries for this child being actively + * scrubbed. Each of the scrubber thread(s) would finish scrubbing + * the entry (probably failing due to disconnection) and either + * putting the entry back into the queue or continuing further. + * Either way, pending entries for this child's queue need not be + * drained; entries just sit there in the queued/ready list to be + * consumed later upon re-connection. + */ + pthread_mutex_lock(&fsscrub->mutex); + { + list_del_init(&child->list); + } + pthread_mutex_unlock(&fsscrub->mutex); + + /** + * 0x1: cleanup scanner thread + * + * The pending timer needs to be removed _after_ cleaning up the + * filesystem scanner (scheduling the next scrub time is not a + * cancellation point). + */ + ret = gf_thread_cleanup_xint(child->thread); + if (ret) + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP, NULL); + + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED, + "brick-path=%s", child->brick_path, NULL); + + return 0; +} + +/** + * OK.. this child has made it's mind to go down the drain. So, + * let's clean up what it touched. (NOTE: there's no need to clean + * the inode table, it's just reused taking care of stale inodes) + */ +int32_t +br_brick_disconnect(xlator_t *this, br_child_t *child) +{ + int32_t ret = 0; + struct br_monitor *scrub_monitor = NULL; + br_private_t *priv = this->private; + + scrub_monitor = &priv->scrub_monitor; + + /* Lock order should be wakelock and then child lock to + * dead locks. 
+ */ + pthread_mutex_lock(&scrub_monitor->wakelock); + { + pthread_mutex_lock(&child->lock); + { + if (!_br_is_child_connected(child)) + goto unblock; + + /* child is on death row.. */ + _br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED); + + if (priv->iamscrubber) + ret = br_cleanup_scrubber(this, child); + else + ret = br_cleanup_signer(this, child); + } + unblock: + pthread_mutex_unlock(&child->lock); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + + return ret; +} + +/** + * This function is executed in a separate thread. The thread gets the + * brick from where CHILD_UP has received from the queue and gets the + * information regarding that brick (such as brick path). + */ +void * +br_handle_events(void *arg) +{ + int32_t ret = 0; + xlator_t *this = NULL; + br_private_t *priv = NULL; + br_child_t *child = NULL; + struct br_child_event *childev = NULL; + + this = arg; + priv = this->private; + + /* + * Since, this is the topmost xlator, THIS has to be set by bit-rot + * xlator itself (STACK_WIND won't help in this case). Also it has + * to be done for each thread that gets spawned. Otherwise, a new + * thread will get global_xlator's pointer when it does "THIS". 
+ */ + THIS = this; + + while (1) { + pthread_mutex_lock(&priv->lock); + { + while (list_empty(&priv->bricks)) + pthread_cond_wait(&priv->cond, &priv->lock); + + childev = list_first_entry(&priv->bricks, struct br_child_event, + list); + list_del_init(&childev->list); + } + pthread_mutex_unlock(&priv->lock); + + child = childev->child; + ret = childev->call(this, child); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED, + "name=%s", child->xl->name, NULL); + GF_FREE(childev); + } + + return NULL; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED, NULL); + return ret; + } + + return ret; +} + +static void +_br_qchild_event(xlator_t *this, br_child_t *child, br_child_handler *call) +{ + br_private_t *priv = NULL; + struct br_child_event *childev = NULL; + + priv = this->private; + + childev = GF_CALLOC(1, sizeof(*childev), gf_br_mt_br_child_event_t); + if (!childev) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_EVENT_UNHANDLED, + "Brick-name=%s", child->xl->name, NULL); + return; + } + + INIT_LIST_HEAD(&childev->list); + childev->this = this; + childev->child = child; + childev->call = call; + + list_add_tail(&childev->list, &priv->bricks); +} + +int +br_scrubber_status_get(xlator_t *this, dict_t **dict) +{ + int ret = -1; + br_private_t *priv = NULL; + struct br_scrub_stats *scrub_stats = NULL; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("bit-rot", priv, out); + + scrub_stats = &priv->scrub_stat; + + ret = br_get_bad_objects_list(this, dict); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to collect corrupt " + "files"); + } + + ret = dict_set_int8(*dict, "scrub-running", scrub_stats->scrub_running); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed setting scrub_running " + "entry to the dictionary"); + } + + ret = 
dict_set_uint64(*dict, "scrubbed-files", scrub_stats->scrubbed_files); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to setting scrubbed file " + "entry to the dictionary"); + } + + ret = dict_set_uint64(*dict, "unsigned-files", scrub_stats->unsigned_files); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set unsigned file count" + " entry to the dictionary"); + } + + ret = dict_set_uint64(*dict, "scrub-duration", scrub_stats->scrub_duration); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set scrub duration" + " entry to the dictionary"); + } + + ret = dict_set_dynstr_with_alloc(*dict, "last-scrub-time", + scrub_stats->last_scrub_time); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set " + "last scrub time value"); + } + +out: + return ret; +} + +int +notify(xlator_t *this, int32_t event, void *data, ...) +{ + int idx = -1; + int ret = -1; + xlator_t *subvol = NULL; + br_child_t *child = NULL; + br_private_t *priv = NULL; + dict_t *output = NULL; + va_list ap; + struct br_monitor *scrub_monitor = NULL; + + subvol = (xlator_t *)data; + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg_trace(this->name, 0, "Notification received: %d", event); + + idx = br_find_child_index(this, subvol); + + switch (event) { + case GF_EVENT_CHILD_UP: + if (idx < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL, + "event=%d", event, NULL); + goto out; + } + + pthread_mutex_lock(&priv->lock); + { + child = &priv->children[idx]; + if (child->child_up == 1) + goto unblock_0; + priv->up_children++; + + child->child_up = 1; + child->xl = subvol; + if (!child->table) + child->table = inode_table_new(4096, subvol); + + _br_qchild_event(this, child, br_brick_connect); + pthread_cond_signal(&priv->cond); + } + unblock_0: + pthread_mutex_unlock(&priv->lock); + + if (priv->up_children == priv->child_count) + default_notify(this, event, data); + break; + + case GF_EVENT_CHILD_DOWN: + if (idx < 0) { + gf_smsg(this->name, 
GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL, + "event=%d", event, NULL); + goto out; + } + + pthread_mutex_lock(&priv->lock); + { + child = &priv->children[idx]; + if (child->child_up == 0) + goto unblock_1; + + child->child_up = 0; + priv->up_children--; + + _br_qchild_event(this, child, br_brick_disconnect); + pthread_cond_signal(&priv->cond); + } + unblock_1: + pthread_mutex_unlock(&priv->lock); + + if (priv->up_children == 0) + default_notify(this, event, data); + break; + + case GF_EVENT_SCRUB_STATUS: + gf_msg_debug(this->name, GF_LOG_INFO, + "BitRot scrub status " + "called"); + va_start(ap, data); + output = va_arg(ap, dict_t *); + va_end(ap); + + ret = br_scrubber_status_get(this, &output); + gf_msg_debug(this->name, 0, "returning %d", ret); + break; + + case GF_EVENT_SCRUB_ONDEMAND: + gf_log(this->name, GF_LOG_INFO, + "BitRot scrub ondemand " + "called"); + + if (scrub_monitor->state != BR_SCRUB_STATE_PENDING) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, "Current-state=%d", + scrub_monitor->state, NULL); + return -2; + } + + /* Needs synchronization with reconfigure thread */ + pthread_mutex_lock(&priv->lock); + { + ret = br_scrub_state_machine(this, _gf_true); + } + pthread_mutex_unlock(&priv->lock); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, NULL); + } + gf_msg_debug(this->name, 0, "returning %d", ret); + break; + default: + default_notify(this, event, data); + } + +out: + return 0; +} + +static void +br_fini_signer(xlator_t *this, br_private_t *priv) +{ + int i = 0; + + if (priv == NULL) + return; + + for (; i < priv->signer_th_count; i++) { + (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]); + } + GF_FREE(priv->obj_queue->workers); + + pthread_cond_destroy(&priv->object_cond); +} + +/** + * Initialize signer specific structures, spawn worker threads. 
+ */ + +static int32_t +br_init_signer(xlator_t *this, br_private_t *priv) +{ + int i = 0; + int32_t ret = -1; + + /* initialize gfchangelog xlator context */ + ret = gf_changelog_init(this); + if (ret) + goto out; + + pthread_cond_init(&priv->object_cond, NULL); + + priv->obj_queue = GF_CALLOC(1, sizeof(*priv->obj_queue), + gf_br_mt_br_ob_n_wk_t); + if (!priv->obj_queue) + goto cleanup_cond; + INIT_LIST_HEAD(&priv->obj_queue->objects); + + priv->obj_queue->workers = GF_CALLOC( + priv->signer_th_count, sizeof(pthread_t), gf_br_mt_br_worker_t); + if (!priv->obj_queue->workers) + goto cleanup_obj_queue; + + for (i = 0; i < priv->signer_th_count; i++) { + ret = gf_thread_create(&priv->obj_queue->workers[i], NULL, + br_process_object, this, "brpobj"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + BRB_MSG_THREAD_CREATION_FAILED, NULL); + ret = -1; + goto cleanup_threads; + } + } + + return 0; + +cleanup_threads: + for (i--; i >= 0; i--) { + (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]); + } + GF_FREE(priv->obj_queue->workers); + +cleanup_obj_queue: + GF_FREE(priv->obj_queue); + +cleanup_cond: + /* that's explicit */ + pthread_cond_destroy(&priv->object_cond); +out: + return -1; +} + +/** + * For signer, only rate limit CPU usage (during hash calculation) when + * compiled with -DBR_RATE_LIMIT_SIGNER cflags, else let it run full + * throttle. + */ +static int32_t +br_rate_limit_signer(xlator_t *this, int child_count, int numbricks) +{ + br_private_t *priv = NULL; + tbf_opspec_t spec = { + 0, + }; + + priv = this->private; + + spec.op = TBF_OP_HASH; + spec.rate = 0; + spec.maxlimit = 0; + + /** + * OK. Most implementations of TBF I've come across generate tokens + * every second (UML, etc..) and some chose sub-second granularity + * (blk-iothrottle cgroups). TBF algorithm itself does not enforce + * any logic for choosing generation interval and it seems pretty + * logical as one could jack up token count per interval w.r.t. 
+ * generation rate. + * + * Value used here is chosen based on a series of test(s) performed + * to balance object signing time and not maxing out on all available + * CPU cores. It's obvious to have seconds granularity and jack up + * token count per interval, thereby achieving close to similar + * results. Let's stick to this as it seems to be working fine for + * the set of ops that are throttled. + **/ + spec.token_gen_interval = 600000; /* In usec */ + +#ifdef BR_RATE_LIMIT_SIGNER + + double contribution = 0; + contribution = ((double)1 - ((double)child_count / (double)numbricks)); + if (contribution == 0) + contribution = 1; + spec.rate = BR_HASH_CALC_READ_SIZE * contribution; + spec.maxlimit = priv->signer_th_count * BR_HASH_CALC_READ_SIZE; + +#endif + + if (!spec.rate) + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO, + "FULL THROTTLE", NULL); + else + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO, + "tokens/sec-rate=%lu", spec.rate, "maxlimit=%lu", spec.maxlimit, + NULL); + + priv->tbf = tbf_init(&spec, 1); + return priv->tbf ? 
0 : -1; +} + +static int32_t +br_signer_handle_options(xlator_t *this, br_private_t *priv, dict_t *options) +{ + if (options) { + GF_OPTION_RECONF("expiry-time", priv->expiry_time, options, uint32, + error_return); + GF_OPTION_RECONF("signer-threads", priv->signer_th_count, options, + uint32, error_return); + } else { + GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return); + GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32, + error_return); + } + + return 0; + +error_return: + return -1; +} + +static int32_t +br_signer_init(xlator_t *this, br_private_t *priv) +{ + int32_t ret = 0; + int numbricks = 0; + + GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return); + GF_OPTION_INIT("brick-count", numbricks, int32, error_return); + GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32, + error_return); + + ret = br_rate_limit_signer(this, priv->child_count, numbricks); + if (ret) + goto error_return; + + ret = br_init_signer(this, priv); + if (ret) + goto cleanup_tbf; + + return 0; + +cleanup_tbf: + /* cleanup TBF */ +error_return: + return -1; +} + +static void +br_free_scrubber_monitor(xlator_t *this, br_private_t *priv) +{ + struct br_monitor *scrub_monitor = &priv->scrub_monitor; + + if (scrub_monitor->timer) { + (void)gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer); + + GF_FREE(scrub_monitor->timer); + scrub_monitor->timer = NULL; + } + + (void)gf_thread_cleanup_xint(scrub_monitor->thread); + + /* Clean up cond and mutex variables */ + pthread_mutex_destroy(&scrub_monitor->mutex); + pthread_cond_destroy(&scrub_monitor->cond); + + pthread_mutex_destroy(&scrub_monitor->wakelock); + pthread_cond_destroy(&scrub_monitor->wakecond); + + pthread_mutex_destroy(&scrub_monitor->donelock); + pthread_cond_destroy(&scrub_monitor->donecond); + + LOCK_DESTROY(&scrub_monitor->lock); +} + +static void +br_free_children(xlator_t *this, br_private_t *priv, int count) +{ + br_child_t *child = NULL; + + for (--count; 
count >= 0; count--) { + child = &priv->children[count]; + mem_pool_destroy(child->timer_pool); + pthread_mutex_destroy(&child->lock); + } + + GF_FREE(priv->children); + priv->children = NULL; +} + +static int +br_init_children(xlator_t *this, br_private_t *priv) +{ + int i = 0; + br_child_t *child = NULL; + xlator_list_t *trav = NULL; + + priv->child_count = xlator_subvolume_count(this); + priv->children = GF_CALLOC(priv->child_count, sizeof(*priv->children), + gf_br_mt_br_child_t); + if (!priv->children) + goto err; + + trav = this->children; + while (trav) { + child = &priv->children[i]; + + pthread_mutex_init(&child->lock, NULL); + child->witnessed = 0; + + br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED); + + child->this = this; + child->xl = trav->xlator; + + child->timer_pool = mem_pool_new(struct gf_tw_timer_list, 4096); + if (!child->timer_pool) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_MEM_POOL_ALLOC, + NULL); + errno = ENOMEM; + goto freechild; + } + + INIT_LIST_HEAD(&child->list); + + i++; + trav = trav->next; + } + + return 0; + +freechild: + br_free_children(this, priv, i); +err: + return -1; +} + +int32_t +init(xlator_t *this) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + + if (!this->children) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD, NULL); + goto out; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_br_mt_br_private_t); + if (!priv) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, NULL); + goto out; + } + + GF_OPTION_INIT("scrubber", priv->iamscrubber, bool, free_priv); + + ret = br_init_children(this, priv); + if (ret) + goto free_priv; + + pthread_mutex_init(&priv->lock, NULL); + pthread_cond_init(&priv->cond, NULL); + + INIT_LIST_HEAD(&priv->bricks); + INIT_LIST_HEAD(&priv->signing); + + priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx); + if (!priv->timer_wheel) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE, + NULL); + goto cleanup; + } + + this->private = 
priv; + + if (!priv->iamscrubber) { + ret = br_signer_init(this, priv); + if (!ret) + ret = br_signer_handle_options(this, priv, NULL); + } else { + ret = br_scrubber_init(this, priv); + if (!ret) + ret = br_scrubber_handle_options(this, priv, NULL); + } + + if (ret) + goto cleanup; + + ret = gf_thread_create(&priv->thread, NULL, br_handle_events, this, + "brhevent"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_THREAD_CREATION_FAILED, + NULL); + ret = -1; + } + + if (!ret) { + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED, "mode=%s", + (priv->iamscrubber) ? "SCRUBBER" : "SIGNER", NULL); + return 0; + } + +cleanup: + (void)pthread_cond_destroy(&priv->cond); + (void)pthread_mutex_destroy(&priv->lock); + + br_free_children(this, priv, priv->child_count); + +free_priv: + GF_FREE(priv); +out: + this->private = NULL; + return -1; +} + +void +fini(xlator_t *this) +{ + br_private_t *priv = this->private; + + if (!priv) + return; + + if (!priv->iamscrubber) + br_fini_signer(this, priv); + else + (void)br_free_scrubber_monitor(this, priv); + + br_free_children(this, priv, priv->child_count); + + this->private = NULL; + GF_FREE(priv); + + glusterfs_ctx_tw_put(this->ctx); + + return; +} + +static void +br_reconfigure_monitor(xlator_t *this) +{ + int32_t ret = 0; + + ret = br_scrub_state_machine(this, _gf_false); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, + NULL); + } +} + +static int +br_reconfigure_scrubber(xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + + priv = this->private; + + pthread_mutex_lock(&priv->lock); + { + ret = br_scrubber_handle_options(this, priv, options); + } + pthread_mutex_unlock(&priv->lock); + + if (ret) + goto err; + + /* change state for all _up_ subvolume(s) */ + pthread_mutex_lock(&priv->lock); + { + br_reconfigure_monitor(this); + } + pthread_mutex_unlock(&priv->lock); + +err: + return ret; +} + +static int 
+br_reconfigure_signer(xlator_t *this, dict_t *options) +{ + br_private_t *priv = this->private; + + return br_signer_handle_options(this, priv, options); +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = 0; + br_private_t *priv = NULL; + + priv = this->private; + + if (priv->iamscrubber) + ret = br_reconfigure_scrubber(this, options); + else + ret = br_reconfigure_signer(this, options); + + return ret; +} + +struct xlator_fops fops; + +struct xlator_cbks cbks; + +struct volume_options options[] = { + { + .key = {"expiry-time"}, + .type = GF_OPTION_TYPE_INT, + .default_value = SIGNING_TIMEOUT, + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Waiting time for an object on which it waits " + "before it is signed", + }, + { + .key = {"brick-count"}, + .type = GF_OPTION_TYPE_STR, + .description = "Total number of bricks for the current node for " + "all volumes in the trusted storage pool.", + }, + { + .key = {"scrubber", "scrub"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .description = "option to run as a scrubber", + }, + { + .key = {"scrub-throttle"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "lazy", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Scrub-throttle value is a measure of how fast " + "or slow the scrubber scrubs the filesystem for " + "volume <VOLNAME>", + }, + { + .key = {"scrub-freq"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "biweekly", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Scrub frequency for volume <VOLNAME>", + }, + { + .key = {"scrub-state"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "active", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Pause/Resume scrub. 
Upon resume, scrubber " + "continues from where it left off.", + }, + { + .key = {"signer-threads"}, + .type = GF_OPTION_TYPE_INT, + .default_value = BR_WORKERS, + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Number of signing process threads. As a best " + "practice, set this to the number of processor cores", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "bit-rot", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h new file mode 100644 index 00000000000..8ac7dcdac3d --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -0,0 +1,302 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __BIT_ROT_H__ +#define __BIT_ROT_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> +#include "changelog.h" +#include "timer-wheel.h" + +#include <glusterfs/throttle-tbf.h> +#include "bit-rot-ssm.h" + +#include "bit-rot-common.h" +#include "bit-rot-stub-mem-types.h" +#include "bit-rot-scrub-status.h" + +#include <openssl/sha.h> + +typedef enum scrub_throttle { + BR_SCRUB_THROTTLE_VOID = -1, + BR_SCRUB_THROTTLE_LAZY = 0, + BR_SCRUB_THROTTLE_NORMAL = 1, + BR_SCRUB_THROTTLE_AGGRESSIVE = 2, + BR_SCRUB_THROTTLE_STALLED = 3, +} scrub_throttle_t; + +typedef enum scrub_freq { + BR_FSSCRUB_FREQ_HOURLY = 1, + BR_FSSCRUB_FREQ_DAILY, + BR_FSSCRUB_FREQ_WEEKLY, + BR_FSSCRUB_FREQ_BIWEEKLY, + BR_FSSCRUB_FREQ_MONTHLY, + BR_FSSCRUB_FREQ_MINUTE, + BR_FSSCRUB_FREQ_STALLED, +} scrub_freq_t; + +#define signature_size(hl) (sizeof(br_isignature_t) + hl + 1) + +struct br_scanfs { + gf_lock_t entrylock; + + pthread_mutex_t waitlock; + pthread_cond_t waitcond; + + unsigned int entries; + struct list_head queued; + struct list_head ready; +}; + +/* just need three states to track child status */ +typedef enum br_child_state { + BR_CHILD_STATE_CONNECTED = 1, + BR_CHILD_STATE_INITIALIZING, + BR_CHILD_STATE_CONNFAILED, + BR_CHILD_STATE_DISCONNECTED, +} br_child_state_t; + +struct br_child { + pthread_mutex_t lock; /* protects child state */ + char witnessed; /* witnessed at least one successful + connection */ + br_child_state_t c_state; /* current state of this child */ + + char child_up; /* Indicates whether this child is + up or not */ + xlator_t *xl; /* client xlator corresponding to + this child */ + inode_table_t *table; /* inode table for this child */ + char brick_path[PATH_MAX]; /* brick export directory of this + child */ + struct list_head list; /* hook to attach to the list of + UP children 
*/ + xlator_t *this; /* Bit rot xlator */ + + pthread_t thread; /* initial crawler for unsigned + object(s) or scrub crawler */ + int threadrunning; /* active thread */ + + struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */ + + struct timeval tv; + + struct br_scanfs fsscan; /* per subvolume FS scanner */ + + gf_boolean_t active_scrubbing; /* Actively scrubbing or not */ +}; + +typedef struct br_child br_child_t; + +struct br_obj_n_workers { + struct list_head objects; /* queue of objects expired from the + timer wheel and ready to be picked + up for signing */ + pthread_t *workers; /* Threads which pick up the objects + from the above queue and start + signing each object */ +}; + +struct br_scrubber { + xlator_t *this; + + scrub_throttle_t throttle; + + /** + * frequency of scanning for this subvolume. this should + * normally be per-child, but since all children follow the + * same frequency for a volume, this option ends up here + * instead of br_child_t. + */ + scrub_freq_t frequency; + + gf_boolean_t frequency_reconf; + gf_boolean_t throttle_reconf; + + pthread_mutex_t mutex; + pthread_cond_t cond; + + unsigned int nr_scrubbers; + struct list_head scrubbers; + + /** + * list of "rotatable" subvolume(s) undergoing scrubbing + */ + struct list_head scrublist; +}; + +struct br_monitor { + gf_lock_t lock; + pthread_t thread; /* Monitor thread */ + + gf_boolean_t inited; + pthread_mutex_t mutex; + pthread_cond_t cond; /* Thread starts and will be waiting on cond. + First child which is up wakes this up */ + + xlator_t *this; + /* scheduler */ + uint32_t boot; + + int32_t active_child_count; /* Number of children currently scrubbing */ + gf_boolean_t kick; /* This variable tracks the scrubber is + * kicked or not. Both 'kick' and + * 'active_child_count' uses the same pair + * of mutex-cond variable, i.e, wakelock and + * wakecond. 
*/ + + pthread_mutex_t wakelock; + pthread_cond_t wakecond; + + gf_boolean_t done; + pthread_mutex_t donelock; + pthread_cond_t donecond; + + struct gf_tw_timer_list *timer; + br_scrub_state_t state; /* current scrub state */ +}; + +typedef struct br_obj_n_workers br_obj_n_workers_t; + +typedef struct br_private br_private_t; + +typedef void (*br_scrubbed_file_update)(br_private_t *priv); + +struct br_private { + pthread_mutex_t lock; + + struct list_head bricks; /* list of bricks from which enents + have been received */ + + struct list_head signing; + + pthread_cond_t object_cond; /* handling signing of objects */ + int child_count; + br_child_t *children; /* list of subvolumes */ + int up_children; + + pthread_cond_t cond; /* handling CHILD_UP notifications */ + pthread_t thread; /* thread for connecting each UP + child with changelog */ + + struct tvec_base *timer_wheel; /* timer wheel where the objects which + changelog has sent sits and waits + for expiry */ + br_obj_n_workers_t *obj_queue; /* place holder for all the objects + that are expired from timer wheel + and ready to be picked up for + signing and the workers which sign + the objects */ + + uint32_t expiry_time; /* objects "wait" time */ + + uint32_t signer_th_count; /* Number of signing process threads */ + + tbf_t *tbf; /* token bucket filter */ + + gf_boolean_t iamscrubber; /* function as a fs scrubber */ + + struct br_scrub_stats scrub_stat; /* statistics of scrub*/ + + struct br_scrubber fsscrub; /* scrubbers for this subvolume */ + + struct br_monitor scrub_monitor; /* scrubber monitor */ +}; + +struct br_object { + xlator_t *this; + + uuid_t gfid; + + unsigned long signedversion; /* version against which this object will + be signed */ + br_child_t *child; /* object's subvolume */ + + int sign_info; + + struct list_head list; /* hook to add to the queue once the + object is expired from timer wheel */ + void *data; +}; + +typedef struct br_object br_object_t; +typedef 
int32_t(br_scrub_ssm_call)(xlator_t *); + +void +br_log_object(xlator_t *, char *, uuid_t, int32_t); + +void +br_log_object_path(xlator_t *, char *, const char *, int32_t); + +int32_t +br_calculate_obj_checksum(unsigned char *, br_child_t *, fd_t *, struct iatt *); + +int32_t +br_prepare_loc(xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); + +gf_boolean_t +bitd_is_bad_file(xlator_t *, br_child_t *, loc_t *, fd_t *); + +static inline void +_br_set_child_state(br_child_t *child, br_child_state_t state) +{ + child->c_state = state; +} + +static inline int +_br_is_child_connected(br_child_t *child) +{ + return (child->c_state == BR_CHILD_STATE_CONNECTED); +} + +static inline int +_br_is_child_scrub_active(br_child_t *child) +{ + return child->active_scrubbing; +} + +static inline int +_br_child_failed_conn(br_child_t *child) +{ + return (child->c_state == BR_CHILD_STATE_CONNFAILED); +} + +static inline int +_br_child_witnessed_connection(br_child_t *child) +{ + return (child->witnessed == 1); +} + +/* scrub state */ +static inline void +_br_monitor_set_scrub_state(struct br_monitor *scrub_monitor, + br_scrub_state_t state) +{ + scrub_monitor->state = state; +} + +static inline br_scrub_event_t +_br_child_get_scrub_event(struct br_scrubber *fsscrub) +{ + return (fsscrub->frequency == BR_FSSCRUB_FREQ_STALLED) + ? 
BR_SCRUB_EVENT_PAUSE + : BR_SCRUB_EVENT_SCHEDULE; +} + +int32_t +br_get_bad_objects_list(xlator_t *this, dict_t **dict); + +#endif /* __BIT_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am new file mode 100644 index 00000000000..f13de7145fc --- /dev/null +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -0,0 +1,20 @@ +if WITH_SERVER +xlator_LTLIBRARIES = bitrot-stub.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bitrot_stub_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +bitrot_stub_la_SOURCES = bit-rot-stub-helpers.c bit-rot-stub.c +bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h \ + bit-rot-object-version.h bit-rot-stub-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h new file mode 100644 index 00000000000..20561aa7764 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -0,0 +1,178 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __BIT_ROT_COMMON_H__ +#define __BIT_ROT_COMMON_H__ + +#include <glusterfs/glusterfs.h> +#include "bit-rot-object-version.h" + +#define BR_VXATTR_VERSION (1 << 0) +#define BR_VXATTR_SIGNATURE (1 << 1) + +#define BR_VXATTR_SIGN_MISSING (BR_VXATTR_SIGNATURE) +#define BR_VXATTR_ALL_MISSING (BR_VXATTR_VERSION | BR_VXATTR_SIGNATURE) + +#define BR_BAD_OBJ_CONTAINER \ + (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } + +typedef enum br_vxattr_state { + BR_VXATTR_STATUS_FULL = 0, + BR_VXATTR_STATUS_MISSING = 1, + BR_VXATTR_STATUS_UNSIGNED = 2, + BR_VXATTR_STATUS_INVALID = 3, +} br_vxattr_status_t; + +typedef enum br_sign_state { + BR_SIGN_INVALID = -1, + BR_SIGN_NORMAL = 0, + BR_SIGN_REOPEN_WAIT = 1, + BR_SIGN_QUICK = 2, +} br_sign_state_t; + +static inline br_vxattr_status_t +br_version_xattr_state(dict_t *xattr, br_version_t **obuf, + br_signature_t **sbuf, gf_boolean_t *objbad) +{ + int32_t ret = 0; + int32_t vxattr = 0; + br_vxattr_status_t status; + void *data = NULL; + + /** + * The key being present in the dict indicates the xattr was set on + * disk. The presence of xattr itself as of now is suffecient to say + * the the object is bad. + */ + *objbad = _gf_false; + ret = dict_get_bin(xattr, BITROT_OBJECT_BAD_KEY, (void **)&data); + if (!ret) + *objbad = _gf_true; + + ret = dict_get_bin(xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf); + if (ret) + vxattr |= BR_VXATTR_VERSION; + + ret = dict_get_bin(xattr, BITROT_SIGNING_VERSION_KEY, (void **)sbuf); + if (ret) + vxattr |= BR_VXATTR_SIGNATURE; + + switch (vxattr) { + case 0: + status = BR_VXATTR_STATUS_FULL; + break; + case BR_VXATTR_SIGN_MISSING: + status = BR_VXATTR_STATUS_UNSIGNED; + break; + case BR_VXATTR_ALL_MISSING: + status = BR_VXATTR_STATUS_MISSING; + break; + default: + status = BR_VXATTR_STATUS_INVALID; + } + + return status; +} + +/** + * in-memory representation of signature used by signer for object + * signing. 
+ */ +typedef struct br_isignature_in { + int8_t signaturetype; /* signature type */ + + unsigned long signedversion; /* version against which the + object was signed */ + + size_t signaturelen; /* signature length */ + char signature[0]; /* object signature */ +} br_isignature_t; + +/** + * in-memory representation of signature used by scrubber for object + * verification. + */ +typedef struct br_isignature_out { + char stale; /* stale signature? */ + + unsigned long version; /* current signed version */ + + uint32_t time[2]; /* time when the object + got dirtied */ + + int8_t signaturetype; /* hash type */ + size_t signaturelen; /* signature length */ + char signature[0]; /* signature (hash) */ +} br_isignature_out_t; + +typedef struct br_stub_init { + uint32_t timebuf[2]; + char export[PATH_MAX]; +} br_stub_init_t; + +typedef enum { + BR_SIGNATURE_TYPE_VOID = -1, /* object is not signed */ + BR_SIGNATURE_TYPE_ZERO = 0, /* min boundary */ + BR_SIGNATURE_TYPE_SHA256 = 1, /* signed with SHA256 */ + BR_SIGNATURE_TYPE_MAX = 2, /* max boundary */ +} br_signature_type; + +/* BitRot stub start time (virtual xattr) */ +#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init" + +/* signing/reopen hint */ +#define BR_OBJECT_RESIGN 0 +#define BR_OBJECT_REOPEN 1 +#define BR_REOPEN_SIGN_HINT_KEY "trusted.glusterfs.bit-rot.reopen-hint" + +static inline int +br_is_signature_type_valid(int8_t signaturetype) +{ + return ((signaturetype > BR_SIGNATURE_TYPE_ZERO) && + (signaturetype < BR_SIGNATURE_TYPE_MAX)); +} + +static inline void +br_set_default_ongoingversion(br_version_t *buf, uint32_t *tv) +{ + buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_default_signature(br_signature_t *buf, size_t *size) +{ + buf->signaturetype = (int8_t)BR_SIGNATURE_TYPE_VOID; + buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION; + + *size = sizeof(br_signature_t); /* no signature */ +} + 
+static inline void +br_set_ongoingversion(br_version_t *buf, unsigned long version, uint32_t *tv) +{ + buf->ongoingversion = version; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_signature(br_signature_t *buf, br_isignature_t *sign, + size_t signaturelen, size_t *size) +{ + buf->signaturetype = sign->signaturetype; + buf->signedversion = ntohl(sign->signedversion); + + memcpy(buf->signature, sign->signature, signaturelen); + *size = sizeof(br_signature_t) + signaturelen; +} + +#endif /* __BIT_ROT_COMMON_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-object-version.h b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h new file mode 100644 index 00000000000..7ae6a5200df --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_OBJECT_VERSION_H +#define __BIT_ROT_OBJECT_VERSION_H + +/** + * on-disk formats for ongoing version and object signature. + */ +typedef struct br_version { + unsigned long ongoingversion; + uint32_t timebuf[2]; +} br_version_t; + +typedef struct __attribute__((__packed__)) br_signature { + int8_t signaturetype; + + unsigned long signedversion; + + char signature[0]; +} br_signature_t; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c new file mode 100644 index 00000000000..8ac13a09941 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c @@ -0,0 +1,796 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "bit-rot-stub.h" + +br_stub_fd_t * +br_stub_fd_new(void) +{ + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = GF_CALLOC(1, sizeof(*br_stub_fd), gf_br_stub_mt_br_stub_fd_t); + + return br_stub_fd; +} + +int +__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out); + + value = (uint64_t)(long)br_stub_fd; + + ret = __fd_ctx_set(fd, this, value); + +out: + return ret; +} + +br_stub_fd_t * +__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = __fd_ctx_get(fd, this, &value); + if (ret) + return NULL; + + br_stub_fd = (br_stub_fd_t *)((long)value); + +out: + return br_stub_fd; +} + +br_stub_fd_t * +br_stub_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + br_stub_fd = __br_stub_fd_ctx_get(this, fd); + } + UNLOCK(&fd->lock); + +out: + return br_stub_fd; +} + +int32_t +br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out); + + LOCK(&fd->lock); + { + ret = __br_stub_fd_ctx_set(this, fd, br_stub_fd); + } + UNLOCK(&fd->lock); + +out: + return ret; +} + +/** + * Adds an entry to the bad objects 
directory. + * @gfid: gfid of the bad object being added to the bad objects directory + */ +int +br_stub_add(xlator_t *this, uuid_t gfid) +{ + char gfid_path[BR_PATH_MAX_PLUS] = {0}; + char bad_gfid_path[BR_PATH_MAX_PLUS] = {0}; + int ret = 0; + br_stub_private_t *priv = NULL; + struct stat st = {0}; + + priv = this->private; + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out, + errno, EINVAL); + + snprintf(gfid_path, sizeof(gfid_path), "%s/%s", priv->stub_basepath, + uuid_utoa(gfid)); + + ret = sys_stat(gfid_path, &st); + if (!ret) + goto out; + snprintf(bad_gfid_path, sizeof(bad_gfid_path), "%s/stub-%s", + priv->stub_basepath, uuid_utoa(priv->bad_object_dir_gfid)); + + ret = sys_link(bad_gfid_path, gfid_path); + if (ret) { + if ((errno != ENOENT) && (errno != EMLINK) && (errno != EEXIST)) + goto out; + + /* + * Continue with success. At least we'll have half of the + * functionality, in the sense, object is marked bad and + * would be inaccessible. It's only scrub status that would + * show up less number of objects. That's fine as we'll have + * the log files that will have the missing information. 
+ */ + gf_smsg(this->name, GF_LOG_WARNING, errno, BRS_MSG_LINK_FAIL, "gfid=%s", + uuid_utoa(gfid), NULL); + } + + return 0; +out: + return -1; +} + +int +br_stub_del(xlator_t *this, uuid_t gfid) +{ + int32_t op_errno __attribute__((unused)) = 0; + br_stub_private_t *priv = NULL; + int ret = 0; + char gfid_path[BR_PATH_MAX_PLUS] = {0}; + + priv = this->private; + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out, + op_errno, EINVAL); + snprintf(gfid_path, sizeof(gfid_path), "%s/%s", priv->stub_basepath, + uuid_utoa(gfid)); + ret = sys_unlink(gfid_path); + if (ret && (errno != ENOENT)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJ_UNLINK_FAIL, + "path=%s", gfid_path, NULL); + ret = -errno; + goto out; + } + + ret = 0; + +out: + return ret; +} + +static int +br_stub_check_stub_directory(xlator_t *this, char *fullpath) +{ + int ret = 0; + struct stat st = { + 0, + }; + char oldpath[BR_PATH_MAX_PLUS] = {0}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + snprintf(oldpath, sizeof(oldpath), "%s/%s", priv->export, + OLD_BR_STUB_QUARANTINE_DIR); + + ret = sys_stat(fullpath, &st); + if (!ret && !S_ISDIR(st.st_mode)) + goto error_return; + if (ret) { + if (errno != ENOENT) + goto error_return; + ret = sys_stat(oldpath, &st); + if (ret) + ret = mkdir_p(fullpath, 0600, _gf_true); + else + ret = sys_rename(oldpath, fullpath); + } + + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "create-path=%s", fullpath, NULL); + return ret; + +error_return: + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "verify-path=%s", fullpath, NULL); + return -1; +} + +/** + * Function to create the container for the bad objects within the bad objects + * directory. 
+ */ +static int +br_stub_check_stub_file(xlator_t *this, char *path) +{ + int ret = 0; + int fd = -1; + struct stat st = { + 0, + }; + + ret = sys_stat(path, &st); + if (!ret && !S_ISREG(st.st_mode)) + goto error_return; + if (ret) { + if (errno != ENOENT) + goto error_return; + fd = sys_creat(path, 0); + if (fd < 0) + gf_smsg(this->name, GF_LOG_ERROR, errno, + BRS_MSG_BAD_OBJECT_DIR_FAIL, "create-path=%s", path, NULL); + } + + if (fd >= 0) { + sys_close(fd); + ret = 0; + } + + return ret; + +error_return: + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "verify-path=%s", path, NULL); + return -1; +} + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv) +{ + int ret = -1; + char fullpath[BR_PATH_MAX_PLUS] = { + 0, + }; + char stub_gfid_path[BR_PATH_MAX_PLUS] = { + 0, + }; + + gf_uuid_copy(priv->bad_object_dir_gfid, BR_BAD_OBJ_CONTAINER); + + if (snprintf(fullpath, sizeof(fullpath), "%s", priv->stub_basepath) >= + sizeof(fullpath)) + goto out; + + if (snprintf(stub_gfid_path, sizeof(stub_gfid_path), "%s/stub-%s", + priv->stub_basepath, uuid_utoa(priv->bad_object_dir_gfid)) >= + sizeof(stub_gfid_path)) + goto out; + + ret = br_stub_check_stub_directory(this, fullpath); + if (ret) + goto out; + ret = br_stub_check_stub_file(this, stub_gfid_path); + if (ret) + goto out; + + return 0; + +out: + return -1; +} + +call_stub_t * +__br_stub_dequeue(struct list_head *callstubs) +{ + call_stub_t *stub = NULL; + + if (!list_empty(callstubs)) { + stub = list_entry(callstubs->next, call_stub_t, list); + list_del_init(&stub->list); + } + + return stub; +} + +void +__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub) +{ + list_add_tail(&stub->list, callstubs); +} + +void +br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub) +{ + br_stub_private_t *priv = NULL; + + priv = this->private; + pthread_mutex_lock(&priv->container.bad_lock); + { + __br_stub_enqueue(&priv->container.bad_queue, stub); + 
pthread_cond_signal(&priv->container.bad_cond); + } + pthread_mutex_unlock(&priv->container.bad_lock); +} + +void * +br_stub_worker(void *data) +{ + br_stub_private_t *priv = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + + THIS = data; + this = data; + priv = this->private; + + for (;;) { + pthread_mutex_lock(&priv->container.bad_lock); + { + while (list_empty(&priv->container.bad_queue)) { + (void)pthread_cond_wait(&priv->container.bad_cond, + &priv->container.bad_lock); + } + + stub = __br_stub_dequeue(&priv->container.bad_queue); + } + pthread_mutex_unlock(&priv->container.bad_lock); + + if (stub) /* guard against spurious wakeups */ + call_resume(stub); + } + + return NULL; +} + +int32_t +br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) +{ + br_stub_private_t *priv = NULL; + struct stat lstatbuf = {0}; + int ret = 0; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + dict_t *xattr = NULL; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), done); + + VALIDATE_OR_GOTO(loc, done); + if (gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid)) + goto done; + + ret = sys_lstat(priv->stub_basepath, &lstatbuf); + if (ret) { + gf_msg_debug(this->name, errno, + "Stat failed on stub bad " + "object dir"); + op_errno = errno; + goto done; + } else if (!S_ISDIR(lstatbuf.st_mode)) { + gf_msg_debug(this->name, errno, + "bad object container is not " + "a directory"); + op_errno = ENOTDIR; + goto done; + } + + iatt_from_stat(&stbuf, &lstatbuf); + gf_uuid_copy(stbuf.ia_gfid, priv->bad_object_dir_gfid); + + op_ret = op_errno = 0; + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + } + +done: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, loc->inode, &stbuf, + xattr, &postparent); + if (xattr) + 
dict_unref(xattr); + return 0; +} + +static int +is_bad_gfid_file_current(char *filename, uuid_t gfid) +{ + char current_stub_gfid[GF_UUID_BUF_SIZE + 16] = { + 0, + }; + + snprintf(current_stub_gfid, sizeof current_stub_gfid, "stub-%s", + uuid_utoa(gfid)); + return (!strcmp(filename, current_stub_gfid)); +} + +static void +check_delete_stale_bad_file(xlator_t *this, char *filename) +{ + int ret = 0; + struct stat st = {0}; + char filepath[BR_PATH_MAX_PLUS] = {0}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (is_bad_gfid_file_current(filename, priv->bad_object_dir_gfid)) + return; + + snprintf(filepath, sizeof(filepath), "%s/%s", priv->stub_basepath, + filename); + + ret = sys_stat(filepath, &st); + if (!ret && st.st_nlink == 1) + sys_unlink(filepath); +} + +static int +br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off, + size_t size, gf_dirent_t *entries) +{ + off_t in_case = -1; + off_t last_off = 0; + size_t filled = 0; + int count = 0; + int32_t this_size = -1; + gf_dirent_t *this_entry = NULL; + xlator_t *this = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + + this = THIS; + if (!off) { + rewinddir(dir); + } else { + seekdir(dir, off); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != off && off != fctx->bad_object.dir_eof) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "off=(0x%llx)", off, + "dir=%p", dir, NULL); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + } + + while (filled <= size) { + in_case = (u_long)telldir(dir); + + if (in_case == -1) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL, "dir=%p", dir, "err=%s", + strerror(errno), NULL); + goto out; + } + + errno = 0; + entry = sys_readdir(dir, scratch); + if (!entry || errno != 0) { + if (errno == EBADF) { + gf_smsg(THIS->name, GF_LOG_WARNING, 0, + BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, "dir=%p", dir, + "err=%s", 
strerror(errno), NULL); + goto out; + } + break; + } + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + if (!strncmp(entry->d_name, "stub-", strlen("stub-"))) { + check_delete_stale_bad_file(this, entry->d_name); + continue; + } + + this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) + + strlen(entry->d_name) + 1; + + if (this_size + filled > size) { + seekdir(dir, in_case); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != in_case && + in_case != fctx->bad_object.dir_eof) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "in_case=(0x%llx)", + in_case, "dir=%p", dir, NULL); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + break; + } + + this_entry = gf_dirent_for_name(entry->d_name); + + if (!this_entry) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_CREATE_GF_DIRENT_FAILED, "entry-name=%s", + entry->d_name, "err=%s", strerror(errno), NULL); + goto out; + } + /* + * we store the offset of next entry here, which is + * probably not intended, but code using syncop_readdir() + * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it + * for directory read resumption. 
+ */ + last_off = (u_long)telldir(dir); + this_entry->d_off = last_off; + this_entry->d_ino = entry->d_ino; + + list_add_tail(&this_entry->list, &entries->list); + + filled += this_size; + count++; + } + + if ((!sys_readdir(dir, scratch) && (errno == 0))) { + /* Indicate EOF */ + errno = ENOENT; + /* Remember EOF offset for later detection */ + fctx->bad_object.dir_eof = last_off; + } +out: + return count; +} + +int32_t +br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) +{ + br_stub_fd_t *fctx = NULL; + DIR *dir = NULL; + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + int count = 0; + gf_dirent_t entries; + gf_boolean_t xdata_unref = _gf_false; + dict_t *dict = NULL; + + INIT_LIST_HEAD(&entries.list); + + fctx = br_stub_fd_ctx_get(this, fd); + if (!fctx) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_FD_CONTEXT_FAILED, + "fd=%p", fd, NULL); + op_errno = -ret; + goto done; + } + + dir = fctx->bad_object.dir; + + if (!dir) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_HANDLE_DIR_NULL, + "fd=%p", fd, NULL); + op_errno = EINVAL; + goto done; + } + + count = br_stub_fill_readdir(fd, fctx, dir, off, size, &entries); + + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + + dict = xdata; + (void)br_stub_bad_objects_path(this, fd, &entries, &dict); + if (!xdata && dict) { + xdata = dict; + xdata_unref = _gf_true; + } + +done: + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + if (xdata_unref) + dict_unref(xdata); + return 0; +} + +/** + * This function is called to mainly obtain the paths of the corrupt + * objects (files as of now). Currently scrub status prints only the + * gfid of the corrupted files. 
Reason is, bitrot-stub maintains the + * list of the corrupted objects as entries inside the quarantine + * directory (<brick export>/.glusterfs/quarantine) + * + * And the name of each entry in the quarantine directory is the gfid + * of the corrupted object. So scrub status will just show that info. + * But it helps the users a lot if the actual path to the object is + * also reported. Hence the below function to get that information. + * The function allocates a new dict to be returned (if it does not + * get one from the caller of readdir i.e. scrubber as of now), and + * stores the paths of each corrupted gfid there. The gfid is used as + * the key and path is used as the value. + * + * NOTE: The path will be there in the following situations + * 1) gfid2path option has been enabled (posix xlator option) + * and the corrupted file contains the path as an extended + * attribute. + * 2) If the gfid2path option is not enabled, OR if the xattr + * is absent, then the inode table should have it. + * The path will be there if a name based lookup has happened + * on the file which has been corrupted. With lookup an inode and + * dentry would be created in the inode table. And the path is + * constructed using the in-memory inode and dentry. If a lookup + * has not happened OR the inode corresponding to the corrupted + * file does not exist in the inode table (because it got purged + * as lru limit of the inodes exceeded) OR a nameless lookup had + * happened to populate the inode in the inode table, then the + * path will not be printed in scrub and only the gfid will be there.
+ **/ +int +br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries, + dict_t **dict) +{ + gf_dirent_t *entry = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + uuid_t gfid = {0}; + int ret = -1; + dict_t *tmp_dict = NULL; + char str_gfid[64] = {0}; + + if (list_empty(&entries->list)) + return 0; + + tmp_dict = *dict; + + if (!tmp_dict) { + tmp_dict = dict_new(); + /* + * If the allocation of dict fails then no need treat it + * it as a error. This path (or function) is executed when + * "gluster volume bitrot <volume name> scrub status" is + * executed, to get the list of the corrupted objects. + * And the motive of this function is to get the paths of + * the corrupted objects. If the dict allocation fails, then + * the scrub status will only show the gfids of those corrupted + * objects (which is the behavior as of the time of this patch + * being worked upon). So just return and only the gfids will + * be shown. + */ + if (!tmp_dict) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_FAILED, NULL); + goto out; + } + } + + list_for_each_entry(entry, &entries->list, list) + { + gf_uuid_clear(gfid); + gf_uuid_parse(entry->d_name, gfid); + + inode = inode_find(fd->inode->table, gfid); + + /* No need to check the return value here. + * Because @hpath is examined. 
+ */ + (void)br_stub_get_path_of_gfid(this, fd->inode, inode, gfid, &hpath); + + if (hpath) { + gf_msg_debug(this->name, 0, + "path of the corrupted " + "object (gfid: %s) is %s", + uuid_utoa(gfid), hpath); + br_stub_entry_xattr_fill(this, hpath, entry, tmp_dict); + } else + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "gfid=%s", uuid_utoa_r(gfid, str_gfid), NULL); + + inode = NULL; + hpath = NULL; + } + + ret = 0; + *dict = tmp_dict; + +out: + return ret; +} + +int +br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode, + uuid_t gfid, char **path) +{ + int32_t ret = -1; + char gfid_str[64] = {0}; + + GF_VALIDATE_OR_GOTO("bitrot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, parent, out); + GF_VALIDATE_OR_GOTO(this->name, path, out); + + /* Above, No need to validate the @inode for hard resolution. Because + * inode can be NULL and if it is NULL, then syncop_gfid_to_path_hard + * will allocate a new inode and proceed. So no need to bother about + * @inode. Because we need it only to send a syncop_getxattr call + * from inside syncop_gfid_to_path_hard. And getxattr fetches the + * path from the backend. + */ + + ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid, + inode, path, _gf_true); + if (ret < 0) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL); + + /* + * Try with soft resolution of path if hard resolve fails. Because + * checking the xattr on disk to get the path of a inode (or gfid) + * is dependent on whether that option is enabled in the posix + * xlator or not. If it is not enabled, then hard resolution by + * checking the on disk xattr fails. + * + * Thus in such situations fall back to the soft resolution which + * mainly depends on the inode_path() function. And for using + * inode_path, @inode has to be linked i.e. 
a successful lookup should + * have happened on the gfid (or the path) to link the inode to the + * inode table. And if @inode is NULL, means, the inode has not been + * found in the inode table and better not to do inode_path() on the + * inode which has not been linked. + */ + if (ret < 0 && inode) { + ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid, + inode, path, _gf_false); + if (ret < 0) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "from-memory gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL); + } + +out: + return ret; +} + +/** + * NOTE: If the file has multiple hardlinks (in gluster volume + * namespace), the path would be one of the hardlinks. Its up to + * the user to find the remaining hardlinks (using find -samefile) + * and remove them. + **/ +void +br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry, + dict_t *dict) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, hpath, out); + + /* + * Use the entry->d_name (which is nothing but the gfid of the + * corrupted object) as the key. And the value will be the actual + * path of that object (or file). + * + * ALso ignore the dict_set errors. scrubber will get the gfid of + * the corrupted object for sure. So, for now lets just log the + * dict_set_dynstr failure and move on. + */ + + ret = dict_set_dynstr(dict, entry->d_name, hpath); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_DICT_SET_FAILED, + "path=%s", hpath, "object-name=%s", entry->d_name, NULL); +out: + return; +} diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h new file mode 100644 index 00000000000..9d93caf069f --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include <glusterfs/mem-types.h> + +enum br_mem_types { + gf_br_stub_mt_private_t = gf_common_mt_end + 1, + gf_br_stub_mt_version_t, + gf_br_stub_mt_inode_ctx_t, + gf_br_stub_mt_signature_t, + gf_br_mt_br_private_t, + gf_br_mt_br_child_t, + gf_br_mt_br_object_t, + gf_br_mt_br_ob_n_wk_t, + gf_br_mt_br_scrubber_t, + gf_br_mt_br_fsscan_entry_t, + gf_br_stub_mt_br_stub_fd_t, + gf_br_stub_mt_br_scanner_freq_t, + gf_br_stub_mt_sigstub_t, + gf_br_mt_br_child_event_t, + gf_br_stub_mt_misc, + gf_br_mt_br_worker_t, + gf_br_stub_mt_end, +}; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h new file mode 100644 index 00000000000..6c15a166f18 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h @@ -0,0 +1,117 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _BITROT_STUB_MESSAGES_H_ +#define _BITROT_STUB_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */
+
+GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED,
+           BRS_MSG_MEM_ACNT_FAILED, BRS_MSG_CREATE_FRAME_FAILED,
+           BRS_MSG_SET_CONTEXT_FAILED, BRS_MSG_CHANGE_VERSION_FAILED,
+           BRS_MSG_ADD_FD_TO_LIST_FAILED, BRS_MSG_SET_FD_CONTEXT_FAILED,
+           BRS_MSG_CREATE_ANONYMOUS_FD_FAILED, BRS_MSG_NO_CHILD,
+           BRS_MSG_STUB_ALLOC_FAILED, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+           BRS_MSG_CANCEL_SIGN_THREAD_FAILED, BRS_MSG_ADD_FD_TO_INODE,
+           BRS_MSG_SIGN_VERSION_ERROR, BRS_MSG_BAD_OBJ_MARK_FAIL,
+           BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, BRS_MSG_REMOVE_INTERNAL_XATTR,
+           BRS_MSG_SET_INTERNAL_XATTR, BRS_MSG_BAD_OBJECT_ACCESS,
+           BRS_MSG_BAD_CONTAINER_FAIL, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+           BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL,
+           BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, BRS_MSG_GET_FD_CONTEXT_FAILED,
+           BRS_MSG_BAD_HANDLE_DIR_NULL, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+           BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL, BRS_MSG_LINK_FAIL,
+           BRS_MSG_BAD_OBJ_UNLINK_FAIL, BRS_MSG_DICT_SET_FAILED,
+           BRS_MSG_PATH_GET_FAILED, BRS_MSG_NULL_LOCAL,
+           BRS_MSG_SPAWN_SIGN_THRD_FAILED, BRS_MSG_KILL_SIGN_THREAD,
+           BRS_MSG_NON_BITD_PID, BRS_MSG_SIGN_PREPARE_FAIL,
+           BRS_MSG_USING_DEFAULT_THREAD_SIZE, BRS_MSG_ALLOC_MEM_FAILED,
+           BRS_MSG_DICT_ALLOC_FAILED, BRS_MSG_CREATE_GF_DIRENT_FAILED,
+           BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED,
+           BRS_MSG_VERSION_PREPARE_FAIL);
+
+/*
+ * Log text for the message IDs declared above. Each macro is named after
+ * its ID with a _STR suffix.
+ */
+#define BRS_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRS_MSG_BAD_OBJ_THREAD_FAIL_STR "pthread_init failed"
+#define BRS_MSG_USING_DEFAULT_THREAD_SIZE_STR "Using default thread stack size"
+#define BRS_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRS_MSG_SPAWN_SIGN_THRD_FAILED_STR                                     \
+    "failed to create the new thread for signer"
+#define BRS_MSG_BAD_CONTAINER_FAIL_STR                                         \
+    "failed to launch the thread for storing bad gfids"
+#define BRS_MSG_CANCEL_SIGN_THREAD_FAILED_STR                                  \
+    "Could not cancel sign serializer thread"
+#define BRS_MSG_KILL_SIGN_THREAD_STR "killed the signer thread"
+#define BRS_MSG_GET_INODE_CONTEXT_FAILED_STR                                   \
+    "failed to init the inode context for the inode"
+#define BRS_MSG_ADD_FD_TO_INODE_STR "failed to add fd to the inode"
+#define BRS_MSG_NO_MEMORY_STR "local allocation failed"
+#define BRS_MSG_BAD_OBJECT_ACCESS_STR "bad object accessed. Returning"
+#define BRS_MSG_SIGN_VERSION_ERROR_STR "Signing version exceeds current version"
+#define BRS_MSG_NON_BITD_PID_STR                                               \
+    "PID from where signature request came, does not belong to bit-rot "       \
+    "daemon. Unwinding the fop"
+#define BRS_MSG_SIGN_PREPARE_FAIL_STR                                          \
+    "failed to prepare the signature. Unwinding the fop"
+#define BRS_MSG_VERSION_PREPARE_FAIL_STR                                       \
+    "failed to prepare the version. Unwinding the fop"
+#define BRS_MSG_STUB_ALLOC_FAILED_STR "failed to allocate stub fop, Unwinding"
+#define BRS_MSG_BAD_OBJ_MARK_FAIL_STR "failed to mark object as bad"
+#define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK_STR                                     \
+    "bad object marking is not from the scrubber"
+#define BRS_MSG_ALLOC_MEM_FAILED_STR "failed to allocate memory"
+#define BRS_MSG_SET_INTERNAL_XATTR_STR "called on the internal xattr"
+#define BRS_MSG_REMOVE_INTERNAL_XATTR_STR "removexattr called on internal xattr"
+#define BRS_MSG_CREATE_ANONYMOUS_FD_FAILED_STR                                 \
+    "failed to create anonymous fd for the inode"
+/* Wording fixed: the original text read "failed add fd to the list". */
+#define BRS_MSG_ADD_FD_TO_LIST_FAILED_STR "failed to add fd to the list"
+#define BRS_MSG_SET_FD_CONTEXT_FAILED_STR                                      \
+    "failed to set the fd context for the file"
+#define BRS_MSG_NULL_LOCAL_STR "local is NULL"
+#define BRS_MSG_DICT_ALLOC_FAILED_STR                                          \
+    "dict allocation failed: cannot send IPC FOP to changelog"
+#define BRS_MSG_SET_EVENT_FAILED_STR "cannot set release event in dict"
+#define BRS_MSG_CREATE_FRAME_FAILED_STR "create_frame() failure"
+#define BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL_STR "closedir error"
+#define BRS_MSG_LINK_FAIL_STR "failed to record gfid"
+/* Spelling fixed: the original text read "quaratine". */
+#define BRS_MSG_BAD_OBJ_UNLINK_FAIL_STR                                        \
+    "failed to delete bad object link from quarantine directory"
+#define BRS_MSG_BAD_OBJECT_DIR_FAIL_STR "failed stub directory"
+#define BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL_STR                                   \
+    "seekdir failed. Invalid argument (offset reused from another DIR * "      \
+    "structure)"
+#define BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL_STR "telldir failed on dir"
+#define BRS_MSG_BAD_OBJECT_DIR_READ_FAIL_STR "readdir failed on dir"
+#define BRS_MSG_CREATE_GF_DIRENT_FAILED_STR "could not create gf_dirent"
+#define BRS_MSG_GET_FD_CONTEXT_FAILED_STR "pfd is NULL"
+/* Wording fixed: the original text read "dir if NULL". */
+#define BRS_MSG_BAD_HANDLE_DIR_NULL_STR "dir is NULL"
+#define BRS_MSG_ALLOC_FAILED_STR                                               \
+    "failed to allocate new dict for saving the paths of the corrupted "       \
+    "objects. Scrub status will only display the gfid"
+#define BRS_MSG_PATH_GET_FAILED_STR "failed to get the path"
+#define BRS_MSG_PATH_XATTR_GET_FAILED_STR                                      \
+    "failed to get the path xattr from disk for the gfid. Trying to get path " \
+    "from the memory"
+#define BRS_MSG_DICT_SET_FAILED_STR                                            \
+    "failed to set the actual path as the value in the dict for the "          \
+    "corrupted object"
+#define BRS_MSG_SET_CONTEXT_FAILED_STR                                         \
+    "could not set fd context for release callback"
+#define BRS_MSG_CHANGE_VERSION_FAILED_STR "change version failed"
+#endif /* !_BITROT_STUB_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
new file mode 100644
index 00000000000..447dd47ff41
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -0,0 +1,3590 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <ctype.h>
+#include <sys/uio.h>
+#include <signal.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include "changelog.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/call-stub.h>
+
+#include "bit-rot-stub.h"
+#include "bit-rot-stub-mem-types.h"
+#include "bit-rot-stub-messages.h"
+#include "bit-rot-common.h"
+
+#define BR_STUB_REQUEST_COOKIE 0x1
+
+/**
+ * Thread cancellation cleanup handler: unlocks the mutex passed in @arg.
+ */
+void
+br_stub_lock_cleaner(void *arg)
+{
+    pthread_mutex_t *clean_mutex = arg;
+
+    pthread_mutex_unlock(clean_mutex);
+    return;
+}
+
+void *
+br_stub_signth(void *);
+
+/* One queued signing request: the ordering key and the stub to resume. */
+struct br_stub_signentry {
+    unsigned long v;
+
+    call_stub_t *stub;
+
+    struct list_head list;
+};
+
+/**
+ * Set up memory accounting for this xlator.
+ *
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init() (a warning is logged when that is non-zero).
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int32_t ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
+
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_MEM_ACNT_FAILED, NULL);
+        return ret;
+    }
+
+    return ret;
+}
+
+/**
+ * Initialise the bad-object container: condition variable, mutex, queue,
+ * the stub directory (br_stub_dir_create) and the worker thread.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything that was
+ * initialised here is destroyed again. The thread attribute object is
+ * destroyed on every path once it has been initialised, including success
+ * and a br_stub_dir_create() failure.
+ */
+int
+br_stub_bad_object_container_init(xlator_t *this, br_stub_private_t *priv)
+{
+    pthread_attr_t w_attr;
+    int ret = -1;
+
+    ret = pthread_cond_init(&priv->container.bad_cond, NULL);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "cond_init ret=%d", ret, NULL);
+        goto out;
+    }
+
+    ret = pthread_mutex_init(&priv->container.bad_lock, NULL);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "mutex_init ret=%d", ret, NULL);
+        goto cleanup_cond;
+    }
+
+    ret = pthread_attr_init(&w_attr);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "attr_init ret=%d", ret, NULL);
+        goto cleanup_lock;
+    }
+
+    /* A rejected stack size is not fatal: the default size is used. */
+    ret = pthread_attr_setstacksize(&w_attr, BAD_OBJECT_THREAD_STACK_SIZE);
+    if (ret == EINVAL) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0,
+                BRS_MSG_USING_DEFAULT_THREAD_SIZE, NULL);
+    }
+
+    INIT_LIST_HEAD(&priv->container.bad_queue);
+    ret = br_stub_dir_create(this, priv);
+    if (ret < 0)
+        goto cleanup_attr;
+
+    ret = gf_thread_create(&priv->container.thread, &w_attr, br_stub_worker,
+                           this, "brswrker");
+    if (ret)
+        goto cleanup_attr;
+
+    /* The attribute object is only needed while creating the thread. */
+    pthread_attr_destroy(&w_attr);
+
+    return 0;
+
+cleanup_attr:
+    pthread_attr_destroy(&w_attr);
+cleanup_lock:
+    pthread_mutex_destroy(&priv->container.bad_lock);
+cleanup_cond:
+    pthread_cond_destroy(&priv->container.bad_cond);
+out:
+    return -1;
+}
+
+/**
+ * xlator init: allocate the private structure, read the "bitrot" and
+ * "export" options, record the boot time and, when versioning is enabled,
+ * start the signer thread and the bad-object container.
+ *
+ * Returns 0 on success, -1 on failure. On failure this->private is reset
+ * to NULL and the private structure is freed.
+ */
+int32_t
+init(xlator_t *this)
+{
+    int ret = 0;
+    char *tmp = NULL;
+    struct timeval tv = {
+        0,
+    };
+    br_stub_private_t *priv = NULL;
+
+    if (!this->children) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_CHILD, NULL);
+        goto error_return;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_br_stub_mt_private_t);
+    if (!priv)
+        goto error_return;
+
+    priv->local_pool = mem_pool_new(br_stub_local_t, 512);
+    if (!priv->local_pool)
+        goto free_priv;
+
+    GF_OPTION_INIT("bitrot", priv->do_versioning, bool, free_mempool);
+
+    GF_OPTION_INIT("export", tmp, str, free_mempool);
+
+    /* Reject paths that would be truncated. */
+    if (snprintf(priv->export, PATH_MAX, "%s", tmp) >= PATH_MAX)
+        goto free_mempool;
+
+    if (snprintf(priv->stub_basepath, sizeof(priv->stub_basepath), "%s/%s",
+                 priv->export,
+                 BR_STUB_QUARANTINE_DIR) >= sizeof(priv->stub_basepath))
+        goto free_mempool;
+
+    (void)gettimeofday(&tv, NULL);
+
+    /* boot time is in network endian format */
+    priv->boot[0] = htonl(tv.tv_sec);
+    priv->boot[1] = htonl(tv.tv_usec);
+
+    pthread_mutex_init(&priv->lock, NULL);
+    pthread_cond_init(&priv->cond, NULL);
+    INIT_LIST_HEAD(&priv->squeue);
+
+    /* Thread creations need 'this' to be passed so that THIS can be
+     * assigned inside the thread. So setting this->private here.
+     */
+    this->private = priv;
+    if (!priv->do_versioning)
+        return 0;
+
+    ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
+                           "brssign");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SPAWN_SIGN_THRD_FAILED,
+                NULL);
+        goto cleanup_lock;
+    }
+
+    ret = br_stub_bad_object_container_init(this, priv);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL, NULL);
+        goto cleanup_signth;
+    }
+
+    gf_msg_debug(this->name, 0, "bit-rot stub loaded");
+
+    return 0;
+
+cleanup_signth:
+    /* The signer thread uses priv->lock and priv->cond, so it has to be
+     * stopped before they are destroyed and priv is freed.
+     */
+    if (gf_thread_cleanup_xint(priv->signth)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+                NULL);
+    }
+    priv->signth = 0;
+cleanup_lock:
+    pthread_cond_destroy(&priv->cond);
+    pthread_mutex_destroy(&priv->lock);
+free_mempool:
+    mem_pool_destroy(priv->local_pool);
+    priv->local_pool = NULL;
+free_priv:
+    GF_FREE(priv);
+    this->private = NULL;
+error_return:
+    return -1;
+}
+
+/* TODO:
+ * As of now enabling bitrot option does 2 things.
+ * 1) Start the Bitrot Daemon which signs the objects (currently files only)
+ *    upon getting notified by the stub.
+ * 2) Enable versioning of the objects. Object versions (again files only) are
+ *    incremented upon modification.
+ * So object versioning is tied to bitrot daemon's signing. In future, object
+ * versioning might be necessary for other things as well apart from bit-rot
+ * detection (well that's the objective of bringing in object-versioning :)).
+ * In that case, better to make versioning a new option and letting it to be
+ * enabled despite bit-rot detection is not needed.
+ * Ex: ICAP.
+ */
+/**
+ * Apply a changed "bitrot" option.
+ *
+ * - versioning enabled and no signer thread yet: start the signer thread
+ *   and the bad-object container;
+ * - versioning enabled and the signer thread already running: nothing to
+ *   do (the running threads are left alone);
+ * - versioning disabled: stop the signer and bad-object threads.
+ *
+ * Returns 0 on success. On failure both threads are stopped and -1 is
+ * returned.
+ */
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    br_stub_private_t *priv = NULL;
+
+    priv = this->private;
+
+    GF_OPTION_RECONF("bitrot", priv->do_versioning, options, bool, err);
+    if (priv->do_versioning && !priv->signth) {
+        ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
+                               "brssign");
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    BRS_MSG_SPAWN_SIGN_THRD_FAILED, NULL);
+            goto err;
+        }
+
+        ret = br_stub_bad_object_container_init(this, priv);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL,
+                    NULL);
+            goto err;
+        }
+    } else if (!priv->do_versioning) {
+        /* Only tear the threads down when versioning was switched off; a
+         * reconfigure that leaves bitrot enabled must not kill them.
+         */
+        if (priv->signth) {
+            if (gf_thread_cleanup_xint(priv->signth)) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            } else {
+                gf_smsg(this->name, GF_LOG_INFO, 0, BRS_MSG_KILL_SIGN_THREAD,
+                        NULL);
+                priv->signth = 0;
+            }
+        }
+
+        if (priv->container.thread) {
+            if (gf_thread_cleanup_xint(priv->container.thread)) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            }
+            priv->container.thread = 0;
+        }
+    }
+
+    ret = 0;
+    return ret;
+err:
+    if (priv->signth) {
+        if (gf_thread_cleanup_xint(priv->signth)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+        }
+        priv->signth = 0;
+    }
+
+    if (priv->container.thread) {
+        if (gf_thread_cleanup_xint(priv->container.thread)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+        }
+        priv->container.thread = 0;
+    }
+    ret = -1;
+    return ret;
+}
+
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    br_stub_private_t *priv = NULL;
+
+    /* Nothing to forward when the xlator or its private data is missing. */
+    if (!this)
+        return 0;
+
+    priv = this->private;
+    if (!priv)
+        return 0;
+
+    default_notify(this, event, data);
+    return 0;
+}
+
+/**
+ * xlator fini: stop the signer and bad-object threads (when versioning is
+ * enabled), drain both queues, destroy the locks and free the private data.
+ *
+ * If stopping either thread fails, the function returns right away and
+ * leaves the remaining state (including this->private) untouched.
+ */
+void
+fini(xlator_t *this)
+{
+    int32_t ret = 0;
+    br_stub_private_t *priv = this->private;
+    struct br_stub_signentry *sigstub = NULL;
+    call_stub_t *stub = NULL;
+
+    if (!priv)
+        return;
+
+    if (!priv->do_versioning)
+        goto cleanup;
+
+    ret = gf_thread_cleanup_xint(priv->signth);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+                NULL);
+        goto out;
+    }
+    priv->signth = 0;
+
+    /* Drop signing requests that were still queued. */
+    while (!list_empty(&priv->squeue)) {
+        sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry,
+                                   list);
+        list_del_init(&sigstub->list);
+
+        call_stub_destroy(sigstub->stub);
+        GF_FREE(sigstub);
+    }
+
+    ret = gf_thread_cleanup_xint(priv->container.thread);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+                NULL);
+        goto out;
+    }
+
+    priv->container.thread = 0;
+
+    /* Drop bad-object stubs that were still queued. */
+    while (!list_empty(&priv->container.bad_queue)) {
+        stub = list_first_entry(&priv->container.bad_queue, call_stub_t, list);
+        list_del_init(&stub->list);
+        call_stub_destroy(stub);
+    }
+
+    pthread_mutex_destroy(&priv->container.bad_lock);
+    pthread_cond_destroy(&priv->container.bad_cond);
+
+cleanup:
+    pthread_mutex_destroy(&priv->lock);
+    pthread_cond_destroy(&priv->cond);
+
+    if (priv->local_pool) {
+        mem_pool_destroy(priv->local_pool);
+        priv->local_pool = NULL;
+    }
+
+    this->private = NULL;
+    GF_FREE(priv);
+
+out:
+    return;
+}
+
+/**
+ * Allocate one zeroed buffer large enough for the requested pieces: a
+ * br_version_t when @obuf is given, and a br_signature_t followed by
+ * @signaturelen bytes when @sbuf is given. When both are requested the
+ * signature part starts right after the version part.
+ *
+ * Returns 0 on success, -1 when the allocation fails.
+ */
+static int
+br_stub_alloc_versions(br_version_t **obuf, br_signature_t **sbuf,
+                       size_t signaturelen)
+{
+    void *mem = NULL;
+    size_t size = 0;
+
+    if (obuf)
+        size += sizeof(br_version_t);
+    if (sbuf)
+        size += sizeof(br_signature_t) + signaturelen;
+
+    mem = GF_CALLOC(1, size, gf_br_stub_mt_version_t);
+    if (!mem)
+        goto error_return;
+
+    if (obuf) {
+        *obuf = (br_version_t *)mem;
+        mem = ((char *)mem + sizeof(br_version_t));
+    }
+    if (sbuf) {
+        *sbuf = (br_signature_t *)mem;
+    }
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+/* Release a buffer obtained from br_stub_alloc_versions(). */
+static void
+br_stub_dealloc_versions(void *mem)
+{
+    GF_FREE(mem);
+}
+
+/* Get a zeroed br_stub_local_t from the xlator's local pool. */
+static br_stub_local_t *
+br_stub_alloc_local(xlator_t *this)
+{
+    br_stub_private_t *priv = this->private;
+
+    return mem_get0(priv->local_pool);
+}
+
+/* Return a local to its pool; a NULL pointer is ignored. */
+static void
+br_stub_dealloc_local(br_stub_local_t *ptr)
+{
+    if (!ptr)
+        return;
+
+    mem_put(ptr);
+}
+
+/**
+ * Fill @obuf with @oversion and the boot time, then store it in @dict under
+ * BITROT_CURRENT_VERSION_KEY. Returns the result of dict_set_bin().
+ */
+static int
+br_stub_prepare_version_request(xlator_t *this, dict_t *dict,
+                                br_version_t *obuf, unsigned long oversion)
+{
+    br_stub_private_t *priv = NULL;
+
+    priv = this->private;
+    br_set_ongoingversion(obuf, oversion, priv->boot);
+
+    return dict_set_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf,
+                        sizeof(br_version_t));
+}
+
+/**
+ * Fill @sbuf from @sign and store it in @dict under
+ * BITROT_SIGNING_VERSION_KEY, using the size reported by br_set_signature().
+ * Returns the result of dict_set_bin().
+ */
+static int
+br_stub_prepare_signing_request(dict_t *dict, br_signature_t *sbuf,
+                                br_isignature_t *sign, size_t signaturelen)
+{
+    size_t size = 0;
+
+    br_set_signature(sbuf, sign, signaturelen, &size);
+
+    return dict_set_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, size);
+}
+
+/**
+ * initialize an inode context starting with a given ongoing version.
+ * a fresh lookup() or a first creat() call initializes the inode
+ * context, hence the inode is marked dirty. this routine also
+ * initializes the transient inode version.
+ *
+ * When @ctx_addr is non-NULL it receives the address of the new context.
+ * Returns 0 on success, -1 on failure (the context is freed).
+ */
+static int
+br_stub_init_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode,
+                            unsigned long version, gf_boolean_t markdirty,
+                            gf_boolean_t bad_object, uint64_t *ctx_addr)
+{
+    int32_t ret = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+
+    ctx = GF_CALLOC(1, sizeof(br_stub_inode_ctx_t), gf_br_stub_mt_inode_ctx_t);
+    if (!ctx)
+        goto error_return;
+
+    INIT_LIST_HEAD(&ctx->fd_list);
+    (markdirty) ? __br_stub_mark_inode_dirty(ctx)
+                : __br_stub_mark_inode_synced(ctx);
+    __br_stub_set_ongoing_version(ctx, version);
+
+    if (bad_object)
+        __br_stub_mark_object_bad(ctx);
+
+    if (fd) {
+        ret = br_stub_add_fd_to_inode(this, fd, ctx);
+        if (ret)
+            goto free_ctx;
+    }
+
+    ret = br_stub_set_inode_ctx(this, inode, ctx);
+    if (ret)
+        goto free_ctx;
+
+    if (ctx_addr)
+        *ctx_addr = (uint64_t)(uintptr_t)ctx;
+    return 0;
+
+free_ctx:
+    GF_FREE(ctx);
+error_return:
+    return -1;
+}
+
+/**
+ * modify the ongoing version of an inode.
+ *
+ * Done under inode->lock and only when the inode is dirty; the inode is
+ * then marked synced. Returns -1 when no context is found, else 0.
+ */
+static int
+br_stub_mod_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode,
+                           unsigned long version)
+{
+    int32_t ret = -1;
+    br_stub_inode_ctx_t *ctx = 0;
+
+    LOCK(&inode->lock);
+    {
+        ctx = __br_stub_get_ongoing_version_ctx(this, inode, NULL);
+        if (ctx == NULL)
+            goto unblock;
+        if (__br_stub_is_inode_dirty(ctx)) {
+            __br_stub_set_ongoing_version(ctx, version);
+            __br_stub_mark_inode_synced(ctx);
+        }
+
+        ret = 0;
+    }
+unblock:
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+/**
+ * Record the versioning state in @local. References are taken on @fd and
+ * @inode when they are non-NULL; br_stub_cleanup_local() drops them.
+ */
+static void
+br_stub_fill_local(br_stub_local_t *local, call_stub_t *stub, fd_t *fd,
+                   inode_t *inode, uuid_t gfid, int versioningtype,
+                   unsigned long memversion)
+{
+    local->fopstub = stub;
+    local->versioningtype = versioningtype;
+    local->u.context.version = memversion;
+    if (fd)
+        local->u.context.fd = fd_ref(fd);
+    if (inode)
+        local->u.context.inode = inode_ref(inode);
+    gf_uuid_copy(local->u.context.gfid, gfid);
+}
+
+/**
+ * Reset @local: drop the fd and inode references it holds and clear its
+ * fields. The local itself is not freed. A NULL @local is ignored.
+ */
+static void
+br_stub_cleanup_local(br_stub_local_t *local)
+{
+    if (!local)
+        return;
+
+    local->fopstub = NULL;
+    local->versioningtype = 0;
+    local->u.context.version = 0;
+    if (local->u.context.fd) {
+        fd_unref(local->u.context.fd);
+        local->u.context.fd = NULL;
+    }
+    if (local->u.context.inode) {
+        inode_unref(local->u.context.inode);
+        local->u.context.inode = NULL;
+    }
+    memset(local->u.context.gfid, '\0', sizeof(uuid_t));
+}
+
+/**
+ * Report whether the inode behind @fd is dirty (*versioning) and whether it
+ * is marked modified (*modified). The inode context is created when it does
+ * not exist yet. When @ctx is non-NULL it receives the context.
+ *
+ * Returns 0 on success, -1 when the context cannot be created.
+ */
+static int
+br_stub_need_versioning(xlator_t *this, fd_t *fd, gf_boolean_t *versioning,
+                        gf_boolean_t *modified, br_stub_inode_ctx_t **ctx)
+{
+    int32_t ret = -1;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *c = NULL;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+
+    *versioning = _gf_false;
+    *modified = _gf_false;
+
+    /* Bitrot stub inode context was initialized only in lookup, create
+     * and mknod cbk path. Object versioning was enabled by default
+     * irrespective of bitrot enabled or not. But it's made optional now.
+     * As a consequence there could be cases where getting inode ctx would
+     * fail because it's not set yet.
+     * e.g., If versioning (with bitrot enable) is enabled while I/O is
+     * happening, it could directly get other fops like writev without
+     * lookup, where getting inode ctx would fail. Hence initialize the
+     * inode ctx on failure to get ctx. This is done in all places where
+     * applicable.
+     */
+    ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr);
+    if (ret < 0) {
+        ret = br_stub_init_inode_versions(this, fd, fd->inode, version,
+                                          _gf_true, _gf_false, &ctx_addr);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                    uuid_utoa(fd->inode->gfid), NULL);
+            goto error_return;
+        }
+    }
+
+    c = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&fd->inode->lock);
+    {
+        if (__br_stub_is_inode_dirty(c))
+            *versioning = _gf_true;
+        if (__br_stub_is_inode_modified(c))
+            *modified = _gf_true;
+    }
+    UNLOCK(&fd->inode->lock);
+
+    if (ctx)
+        *ctx = c;
+    return 0;
+
+error_return:
+    return -1;
+}
+
+/**
+ * Make sure @fd is tracked in the inode context: when it has no stub fd
+ * context yet, add it to @ctx. Returns 0 on success, non-zero on failure.
+ */
+static int32_t
+br_stub_anon_fd_ctx(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    br_stub_fd_t *br_stub_fd = NULL;
+
+    br_stub_fd = br_stub_fd_ctx_get(this, fd);
+    if (!br_stub_fd) {
+        ret = br_stub_add_fd_to_inode(this, fd, ctx);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_INODE,
+                    "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/**
+ * Allocate a local for the versioning operation and attach it to @frame.
+ * For anonymous fds the fd is also registered in the inode context.
+ *
+ * Returns 0 on success; on failure returns -1 and frame->local is not set.
+ */
+static int
+br_stub_versioning_prep(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        br_stub_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    br_stub_local_t *local = NULL;
+
+    local = br_stub_alloc_local(this);
+    if (!local) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY, "gfid=%s",
+                uuid_utoa(fd->inode->gfid), NULL);
+        goto error_return;
+    }
+
+    if (fd_is_anonymous(fd)) {
+        ret = br_stub_anon_fd_ctx(this, fd, ctx);
+        if (ret)
+            goto free_local;
+    }
+
+    frame->local = local;
+
+    return 0;
+
+free_local:
+    br_stub_dealloc_local(local);
+error_return:
+    return -1;
+}
+
+/**
+ * Mark the inode of the fd stored in @local as modified, creating the inode
+ * context first when it is missing. Returns 0 on success, -1 on failure.
+ */
+static int
+br_stub_mark_inode_modified(xlator_t *this, br_stub_local_t *local)
+{
+    fd_t *fd = NULL;
+    int32_t ret = 0;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+
+    fd = local->u.context.fd;
+
+    ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr);
+    if (ret < 0) {
+        ret = br_stub_init_inode_versions(this, fd, fd->inode, version,
+                                          _gf_true, _gf_false, &ctx_addr);
+        if (ret)
+            goto error_return;
+    }
+
+    ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&fd->inode->lock);
+    {
+        __br_stub_set_inode_modified(ctx);
+    }
+    UNLOCK(&fd->inode->lock);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+/**
+ * The possible return values from br_stub_is_bad_object () are:
+ * 1) 0  => as per the inode context object is not bad
+ * 2) -1 => Failed to get the inode context itself
+ * 3) -2 => As per the inode context object is bad
+ * Both -ve values means the fop which called this function is failed
+ * and error is returned upwards.
+ */
+static int
+br_stub_check_bad_object(xlator_t *this, inode_t *inode, int32_t *op_ret,
+                         int32_t *op_errno)
+{
+    int ret = -1;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+
+    ret = br_stub_is_bad_object(this, inode);
+    if (ret == -2) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJECT_ACCESS,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        *op_ret = -1;
+        *op_errno = EIO;
+    }
+
+    /* No inode context yet: try to create one; the result of that attempt
+     * becomes the return value.
+     */
+    if (ret == -1) {
+        ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true,
+                                          _gf_false, NULL);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                    uuid_utoa(inode->gfid), NULL);
+            *op_ret = -1;
+            *op_errno = EINVAL;
+        }
+    }
+
+    return ret;
+}
+
+/**
+ * callback for inode/fd versioning
+ *
+ * On success the in-memory version is updated and the stored fop stub is
+ * resumed. On failure the stub is unwound with the error and the local is
+ * cleaned up and released.
+ */
+int
+br_stub_fd_incversioning_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, dict_t *xdata)
+{
+    fd_t *fd = NULL;
+    inode_t *inode = NULL;
+    unsigned long version = 0;
+    br_stub_local_t *local = NULL;
+
+    local = (br_stub_local_t *)frame->local;
+    if (op_ret < 0)
+        goto done;
+    fd = local->u.context.fd;
+    inode = local->u.context.inode;
+    version = local->u.context.version;
+
+    op_ret = br_stub_mod_inode_versions(this, fd, inode, version);
+    if (op_ret < 0)
+        op_errno = EINVAL;
+
+done:
+    if (op_ret < 0) {
+        frame->local = NULL;
+        call_unwind_error(local->fopstub, -1, op_errno);
+        br_stub_cleanup_local(local);
+        br_stub_dealloc_local(local);
+    } else {
+        call_resume(local->fopstub);
+    }
+    return 0;
+}
+
+/**
+ * Initial object versioning
+ *
+ * Version persists two (2) extended attributes as explained below:
+ *   1. Current (ongoing) version: This is incremented on a writev ()
+ *       or truncate () and is the running version for an object.
+ *   2. Signing version: This is the version against which an object
+ *      was signed (checksummed).
+ *
+ * During initial versioning, both ongoing and signing versions are
+ * set of one and zero respectively.
A write() call increments the
+ * ongoing version as an indication of modification to the object.
+ * Additionally this needs to be persisted on disk and needs to be
+ * durable: fsync().. :-/
+ * As an optimization only the first write() synchronizes the ongoing
+ * version to disk, subsequent write()s before the *last* release()
+ * are no-op's.
+ *
+ * create(), just like lookup() initializes the object versions to
+ * the default. As an optimization this is not a durable operation:
+ * in case of a crash, hard reboot etc.. absence of versioning xattrs
+ * is ignored in scrubber along with the one time crawler explicitly
+ * triggering signing for such objects.
+ *
+ * c.f. br_stub_writev() / br_stub_truncate()
+ */
+
+/**
+ * perform full or incremental versioning on an inode pointed to by an
+ * fd. incremental versioning is done when an inode is dirty and a
+ * writeback is triggered.
+ *
+ * Expects frame->local to be set already; it is filled in here. Winds an
+ * fsetxattr of @dict to the first child with @callback. Returns 0 once the
+ * call has been wound, non-zero when preparing the xdata fails.
+ */
+
+int
+br_stub_fd_versioning(xlator_t *this, call_frame_t *frame, call_stub_t *stub,
+                      dict_t *dict, fd_t *fd, br_stub_version_cbk *callback,
+                      unsigned long memversion, int versioningtype, int durable)
+{
+    int32_t ret = -1;
+    int flags = 0;
+    dict_t *xdata = NULL;
+    br_stub_local_t *local = NULL;
+
+    xdata = dict_new();
+    if (!xdata)
+        goto done;
+
+    ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+    if (ret)
+        goto dealloc_xdata;
+
+    if (durable) {
+        ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0);
+        if (ret)
+            goto dealloc_xdata;
+    }
+
+    local = frame->local;
+
+    br_stub_fill_local(local, stub, fd, fd->inode, fd->inode->gfid,
+                       versioningtype, memversion);
+
+    STACK_WIND(frame, callback, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+
+    ret = 0;
+
+dealloc_xdata:
+    dict_unref(xdata);
+done:
+    return ret;
+}
+
+/**
+ * Persist the writeback version of @ctx through br_stub_fd_versioning().
+ *
+ * On any failure @stub is unwound with an error and the frame's local (if
+ * any) is detached, cleaned up and released. Returns 0 on success.
+ */
+static int
+br_stub_perform_incversioning(xlator_t *this, call_frame_t *frame,
+                              call_stub_t *stub, fd_t *fd,
+                              br_stub_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    dict_t *dict = NULL;
+    br_version_t *obuf = NULL;
+    unsigned long writeback_version = 0;
+    int op_errno = 0;
+    br_stub_local_t *local = NULL;
+
+    op_errno = EINVAL;
+    local = frame->local;
+
+    writeback_version = __br_stub_writeback_version(ctx);
+
+    op_errno = ENOMEM;
+    dict = dict_new();
+    if (!dict)
+        goto out;
+    ret = br_stub_alloc_versions(&obuf, NULL, 0);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+    ret = br_stub_prepare_version_request(this, dict, obuf, writeback_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_VERSION_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        /* obuf was not stored in the dict, so free it here */
+        br_stub_dealloc_versions(obuf);
+        goto out;
+    }
+
+    ret = br_stub_fd_versioning(
+        this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk,
+        writeback_version, BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
+out:
+    if (dict)
+        dict_unref(dict);
+    if (ret) {
+        if (local)
+            frame->local = NULL;
+        call_unwind_error(stub, -1, op_errno);
+        if (local) {
+            br_stub_cleanup_local(local);
+            br_stub_dealloc_local(local);
+        }
+    }
+
+    return ret;
+}
+
+/** {{{ */
+
+/* fsetxattr() */
+
+/**
+ * Resume function for a queued signing request: winds the fsetxattr to the
+ * first child and then drops one reference on @xdata.
+ */
+int32_t
+br_stub_perform_objsign(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        dict_t *dict, int flags, dict_t *xdata)
+{
+    STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+
+    dict_unref(xdata);
+    return 0;
+}
+
+/**
+ * Signer thread: waits for entries on priv->squeue and resumes their stubs
+ * one at a time.
+ */
+void *
+br_stub_signth(void *arg)
+{
+    xlator_t *this = arg;
+    br_stub_private_t *priv = this->private;
+    struct br_stub_signentry *sigstub = NULL;
+
+    THIS = this;
+    while (1) {
+        /*
+         * Disabling bit-rot feature leads to this particular thread
+         * getting cleaned up by reconfigure via a call to the function
+         * gf_thread_cleanup_xint (which in turn calls pthread_cancel
+         * and pthread_join).
But, if this thread had held the mutex
+         * &priv->lock at the time of cancellation, then it leads to
+         * deadlock in future when bit-rot feature is enabled (which
+         * again spawns this thread which can't hold the lock as the
+         * mutex is still held by the previous instance of the thread
+         * which got killed). Also, the br_stub_handle_object_signature
+         * function which is called whenever file has to be signed
+         * also gets blocked as it too attempts to acquire &priv->lock.
+         *
+         * So, arrange for the lock to be unlocked as part of the
+         * cleanup of this thread using pthread_cleanup_push and
+         * pthread_cleanup_pop.
+         */
+        pthread_cleanup_push(br_stub_lock_cleaner, &priv->lock);
+        pthread_mutex_lock(&priv->lock);
+        {
+            while (list_empty(&priv->squeue))
+                pthread_cond_wait(&priv->cond, &priv->lock);
+
+            sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry,
+                                       list);
+            list_del_init(&sigstub->list);
+        }
+        pthread_mutex_unlock(&priv->lock);
+        pthread_cleanup_pop(0);
+
+        call_resume(sigstub->stub);
+
+        GF_FREE(sigstub);
+    }
+
+    return NULL;
+}
+
+/**
+ * Tell whether @dict carries any of the keys the stub treats as internal.
+ */
+static gf_boolean_t
+br_stub_internal_xattr(dict_t *dict)
+{
+    if (dict_get(dict, GLUSTERFS_SET_OBJECT_SIGNATURE) ||
+        dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE) ||
+        dict_get(dict, BR_REOPEN_SIGN_HINT_KEY) ||
+        dict_get(dict, BITROT_OBJECT_BAD_KEY) ||
+        dict_get(dict, BITROT_SIGNING_VERSION_KEY) ||
+        dict_get(dict, BITROT_CURRENT_VERSION_KEY))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/**
+ * Ordering callback for the signing queue: non-zero when the first entry's
+ * version is greater than the second's.
+ */
+int
+orderq(struct list_head *elem1, struct list_head *elem2)
+{
+    struct br_stub_signentry *s1 = NULL;
+    struct br_stub_signentry *s2 = NULL;
+
+    s1 = list_entry(elem1, struct br_stub_signentry, list);
+    s2 = list_entry(elem2, struct br_stub_signentry, list);
+
+    return (s1->v > s2->v);
+}
+
+/**
+ * Compare the version being signed (@sbuf) with the inode's current
+ * in-memory version.
+ *
+ *  - signing version newer than the current version: invalid, returns -1;
+ *  - signing version older than the current version: *fakesuccess is set
+ *    to 1 and 0 is returned;
+ *  - equal: returns 0.
+ *
+ * When the inode context cannot be fetched, the signing key is removed from
+ * @dict and the lookup error is returned. The current version is sampled
+ * once under inode->lock and that sampled value is used for logging.
+ */
+static int
+br_stub_compare_sign_version(xlator_t *this, inode_t *inode,
+                             br_signature_t *sbuf, dict_t *dict,
+                             int *fakesuccess)
+{
+    int32_t ret = -1;
+    uint64_t tmp_ctx = 0;
+    gf_boolean_t invalid = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+    unsigned long currentversion = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, sbuf, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    ret = br_stub_get_inode_ctx(this, inode, &tmp_ctx);
+    if (ret) {
+        dict_del(dict, BITROT_SIGNING_VERSION_KEY);
+        goto out;
+    }
+
+    ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx;
+
+    LOCK(&inode->lock);
+    {
+        currentversion = ctx->currentversion;
+        if (currentversion < sbuf->signedversion) {
+            invalid = _gf_true;
+        } else if (currentversion > sbuf->signedversion) {
+            /* arguments follow the order of the format string */
+            gf_msg_debug(this->name, 0,
+                         "\"Signing version\" "
+                         "(%lu) lower than \"Current version \" "
+                         "(%lu)",
+                         sbuf->signedversion, currentversion);
+            *fakesuccess = 1;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (invalid) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_VERSION_ERROR,
+                "Signing-ver=%lu", sbuf->signedversion, "current-ver=%lu",
+                currentversion, NULL);
+    }
+
+out:
+    return ret;
+}
+
+/**
+ * Build the on-disk signature from @sign, store it in @dict and check it
+ * against the inode's current version.
+ *
+ * Returns -1 when the signature type is invalid or preparing the request
+ * fails, otherwise the result of br_stub_compare_sign_version().
+ */
+static int
+br_stub_prepare_signature(xlator_t *this, dict_t *dict, inode_t *inode,
+                          br_isignature_t *sign, int *fakesuccess)
+{
+    int32_t ret = -1;
+    size_t signaturelen = 0;
+    br_signature_t *sbuf = NULL;
+
+    if (!br_is_signature_type_valid(sign->signaturetype))
+        goto out;
+
+    signaturelen = sign->signaturelen;
+    ret = br_stub_alloc_versions(NULL, &sbuf, signaturelen);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        ret = -1;
+        goto out;
+    }
+    ret = br_stub_prepare_signing_request(dict, sbuf, sign, signaturelen);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        ret = -1;
+        br_stub_dealloc_versions(sbuf);
+        goto out;
+    }
+
+    /* At this point sbuf has been added to dict, so the memory will be freed
+     * when the data from the dict is destroyed
+     */
+    ret = br_stub_compare_sign_version(this, inode, sbuf, dict, fakesuccess);
+out:
+    return ret;
+}
+
+static void
+br_stub_handle_object_signature(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                                dict_t *dict, br_isignature_t *sign,
+                                dict_t *xdata)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int fakesuccess = 0;
+    br_stub_private_t *priv = NULL;
+    struct br_stub_signentry *sigstub = NULL;
+
+    priv = this->private;
+
+    /* Signing requests are accepted only from GF_CLIENT_PID_BITD. */
+    if (frame->root->pid != GF_CLIENT_PID_BITD) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno, BRS_MSG_NON_BITD_PID,
+                "PID=%d", frame->root->pid, NULL);
+        goto dofop;
+    }
+
+    ret = br_stub_prepare_signature(this, dict, fd->inode, sign, &fakesuccess);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto dofop;
+    }
+    /* Stale signing version: report success without touching the disk. */
+    if (fakesuccess) {
+        op_ret = op_errno = 0;
+        goto dofop;
+    }
+
+    dict_del(dict, GLUSTERFS_SET_OBJECT_SIGNATURE);
+
+    ret = -1;
+    /* Hold our own reference on xdata; br_stub_perform_objsign() drops it
+     * after winding, and the error path below drops it at unref_dict.
+     */
+    if (!xdata) {
+        xdata = dict_new();
+        if (!xdata)
+            goto dofop;
+    } else {
+        dict_ref(xdata);
+    }
+
+    ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0);
+    if (ret)
+        goto unref_dict;
+
+    /* prepare dispatch stub to order object signing */
+    sigstub = GF_CALLOC(1, sizeof(*sigstub), gf_br_stub_mt_sigstub_t);
+    if (!sigstub)
+        goto unref_dict;
+
+    INIT_LIST_HEAD(&sigstub->list);
+    sigstub->v = ntohl(sign->signedversion);
+    sigstub->stub = fop_fsetxattr_stub(frame, br_stub_perform_objsign, fd, dict,
+                                       0, xdata);
+    if (!sigstub->stub)
+        goto cleanup_stub;
+
+    /* Queue the request in version order and wake the signer thread. */
+    pthread_mutex_lock(&priv->lock);
+    {
+        list_add_order(&sigstub->list, &priv->squeue, orderq);
+        pthread_cond_signal(&priv->cond);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    return;
+
+cleanup_stub:
+    GF_FREE(sigstub);
+unref_dict:
+    dict_unref(xdata);
+dofop:
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+}
+
+/**
+ * Resume function used after versioning: marks the inode as modified,
+ * unwinds the fsetxattr (with EINVAL when the marking fails) and releases
+ * the frame's local.
+ */
+int32_t
+br_stub_fsetxattr_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int32_t ret = -1;
+    br_stub_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+
+    ret = br_stub_mark_inode_modified(this, local);
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+    return 0;
+}
+
+/**
+ * Handles object reopens. Object reopens can be of 3 types. 2 are from
+ * oneshot crawler and 1 from the regular signer.
+ * ONESHOT CRAWLER:
+ * For those objects which were created before bitrot was enabled. oneshot
+ * crawler crawls the namespace and signs all the objects. It has to do
+ * the versioning before making bit-rot-stub send a sign notification.
+ * So it sends fsetxattr with BR_OBJECT_REOPEN as the value. And bit-rot-stub
+ * upon getting BR_OBJECT_REOPEN value checks if the version has to be
+ * increased or not. By default the version will be increased. But if the
+ * object is modified before BR_OBJECT_REOPEN from oneshot crawler, then
+ * versioning need not be done. In that case simply a success is returned.
+ * SIGNER:
+ * Signer waits for 2 minutes upon getting the notification from bit-rot-stub
+ * and then it sends a dummy write (in reality a fsetxattr) call, to change
+ * the state of the inode from REOPEN_WAIT to SIGN_QUICK. The funny part here
+ * is though the inode's state is REOPEN_WAIT, the call sent by signer is
+ * BR_OBJECT_RESIGN. Once the state is changed to SIGN_QUICK, then yet another
+ * notification is sent upon release (RESIGN would have happened via fsetxattr,
+ * so a fd is needed) and the object is signed truly this time.
+ * There is a challenge in the above RESIGN method by signer. After sending
+ * the 1st notification, the inode could be forgotten before RESIGN request
+ * is received. In that case, the inode's context (the newly looked up inode)
+ * would not indicate the inode as being modified (it would be in the default
+ * state) and because of this, a SIGN_QUICK notification to truly sign the
+ * object would not be sent.
So, this is how its handled. + * if (request == RESIGN) { + * if (inode->sign_info == NORMAL) { + * mark_inode_non_dirty; + * mark_inode_modified; + * } + * GOBACK (means unwind without doing versioning) + * } + */ +static void +br_stub_handle_object_reopen(call_frame_t *frame, xlator_t *this, fd_t *fd, + uint32_t val) +{ + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + call_stub_t *stub = NULL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + gf_boolean_t goback = _gf_true; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + LOCK(&fd->inode->lock); + { + if ((val == BR_OBJECT_REOPEN) && inc_version) + goback = _gf_false; + if (val == BR_OBJECT_RESIGN && ctx->info_sign == BR_SIGN_NORMAL) { + __br_stub_mark_inode_synced(ctx); + __br_stub_set_inode_modified(ctx); + } + (void)__br_stub_inode_sign_state(ctx, GF_FOP_FSETXATTR, fd); + } + UNLOCK(&fd->inode->lock); + + if (goback) { + op_ret = op_errno = 0; + goto unwind; + } + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + local = frame->local; + + stub = fop_fsetxattr_cbk_stub(frame, br_stub_fsetxattr_resume, 0, 0, NULL); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + (void)br_stub_perform_incversioning(this, frame, stub, fd, ctx); + return; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); +} + +/** + * This function only handles bad file identification. Instead of checking in + * fops like open, readv, writev whether the object is bad or not by doing + * getxattr calls, better to catch them when scrubber marks it as bad. 
+ * So this callback is called only when the fsetxattr is sent by the scrubber + * to mark the object as bad. + */ +int +br_stub_fsetxattr_bad_object_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + int32_t ret = -1; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + /* + * What to do if marking the object as bad fails? (i.e. in memory + * marking within the inode context. If we are here, it means fsetxattr + * fop has succeeded on disk and the bad object xattr has been set). + * We can return failure to scrubber, but there is nothing the scrubber + * can do with it (it might assume that the on disk setxattr itself has + * failed). The main purpose of this operation is to help identify the + * bad object by checking the inode context itself (thus avoiding the + * necessity of doing a getxattr fop on the disk). + * + * So as of now, success itself is being returned even though inode + * context set operation fails. + * In future if there is any change in the policy which can handle this, + * then appropriate response should be sent (i.e. success or error). 
+ */ + ret = br_stub_mark_object_bad(this, local->u.context.inode); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL, + "gfid=%s", uuid_utoa(local->u.context.inode->gfid), NULL); + + ret = br_stub_add(this, local->u.context.inode->gfid); + +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +static int32_t +br_stub_handle_bad_object_key(call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (frame->root->pid != GF_CLIENT_PID_SCRUB) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } + + local = br_stub_alloc_local(this); + if (!local) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + + STACK_WIND(frame, br_stub_fsetxattr_bad_object_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** + * As of now, versioning is done by the stub (though as a setxattr + * operation) as part of inode modification operations such as writev, + * truncate, ftruncate. And signing is done by BitD by a fsetxattr call. + * So any kind of setxattr coming on the versioning and the signing xattr is + * not allowed (i.e. BITROT_CURRENT_VERSION_KEY and BITROT_SIGNING_VERSION_KEY). 
+ * In future if BitD/scrubber are allowed to change the versioning + * xattrs (though I cannot see a reason for it as of now), then the below + * function can be modified to block setxattr on version for only applications. + * + * NOTE: BitD sends sign request on GLUSTERFS_SET_OBJECT_SIGNATURE key. + * BITROT_SIGNING_VERSION_KEY is the xattr used to save the signature. + * + */ +static int32_t +br_stub_handle_internal_xattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *key) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR, + "setxattr key=%s", key, "inode-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +static void +br_stub_dump_xattr(xlator_t *this, dict_t *dict, int *op_errno) +{ + char *format = "(%s:%s)"; + char *dump = NULL; + + dump = GF_CALLOC(1, BR_STUB_DUMP_STR_SIZE, gf_br_stub_mt_misc); + if (!dump) { + *op_errno = ENOMEM; + goto out; + } + dict_dump_to_str(dict, dump, BR_STUB_DUMP_STR_SIZE, format); + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR, + "fsetxattr dump=%s", dump, NULL); +out: + if (dump) { + GF_FREE(dump); + } + return; +} + +int +br_stub_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int32_t ret = 0; + uint32_t val = 0; + br_isignature_t *sign = NULL; + br_stub_private_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + priv = this->private; + + if ((frame->root->pid != GF_CLIENT_PID_BITD && + frame->root->pid != GF_CLIENT_PID_SCRUB) && + br_stub_internal_xattr(dict)) { + br_stub_dump_xattr(this, dict, &op_errno); + goto unwind; + } + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + /* object signature request */ + ret = dict_get_bin(dict, GLUSTERFS_SET_OBJECT_SIGNATURE, (void **)&sign); + if (!ret) { + gf_msg_debug(this->name, 0, "got 
SIGNATURE request on %s", + uuid_utoa(fd->inode->gfid)); + br_stub_handle_object_signature(frame, this, fd, dict, sign, xdata); + goto done; + } + + /* signing xattr */ + if (dict_get(dict, BITROT_SIGNING_VERSION_KEY)) { + br_stub_handle_internal_xattr(frame, this, fd, + BITROT_SIGNING_VERSION_KEY); + goto done; + } + + /* version xattr */ + if (dict_get(dict, BITROT_CURRENT_VERSION_KEY)) { + br_stub_handle_internal_xattr(frame, this, fd, + BITROT_CURRENT_VERSION_KEY); + goto done; + } + + if (dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE)) { + br_stub_handle_internal_xattr(frame, this, fd, + GLUSTERFS_GET_OBJECT_SIGNATURE); + goto done; + } + + /* object reopen request */ + ret = dict_get_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, &val); + if (!ret) { + br_stub_handle_object_reopen(frame, this, fd, val); + goto done; + } + + /* handle bad object */ + if (dict_get(dict, BITROT_OBJECT_BAD_KEY)) { + br_stub_handle_bad_object_key(frame, this, fd, dict, flags, xdata); + goto done; + } + +wind: + STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + +done: + return 0; +} + +/** + * Currently BitD and scrubber are doing fsetxattr to either sign the object + * or to mark it as bad. Hence setxattr on any of those keys is denied directly + * without checking from where the fop is coming. + * Later, if BitD or Scrubber does setxattr of those keys, then appropriate + * check has to be added below. 
+ */ +int +br_stub_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (br_stub_internal_xattr(dict)) { + br_stub_dump_xattr(this, dict, &op_errno); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* {f}removexattr() */ + +int32_t +br_stub_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (!strcmp(BITROT_OBJECT_BAD_KEY, name) || + !strcmp(BITROT_SIGNING_VERSION_KEY, name) || + !strcmp(BITROT_CURRENT_VERSION_KEY, name)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR, + "name=%s", name, "file-path=%s", loc->path, NULL); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +br_stub_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (!strcmp(BITROT_OBJECT_BAD_KEY, name) || + !strcmp(BITROT_SIGNING_VERSION_KEY, name) || + !strcmp(BITROT_CURRENT_VERSION_KEY, name)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR, + "name=%s", name, "inode-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* {f}getxattr() */ + +int +br_stub_listxattr_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + br_stub_remove_vxattrs(xattr, _gf_true); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; +} + +/** + * ONE SHOT CRAWLER from BitD signs the objects that it encounters while + * crawling, if the object is identified as stale by the stub. Stub follows + * the below logic to mark an object as stale or not. + * If the ongoing version and the signed_version match, then the object is not + * stale. Just return. Otherwise if they do not match, then it means one + * of the below things. + * 1) If the inode does not need write back of the version and the sign state + * is NORMAL, then some active i/o is going on the object. So skip it. + * A notification will be sent to trigger the sign once the release is + * received on the object. + * 2) If inode does not need writeback of the version and the sign state is + * either reopen wait or quick sign, then it means: + * A) BitD restarted and it is not sure whether the object it encountered + * while crawling is in its timer wheel or not. Since there is no way to + * scan the timer wheel as of now, ONE SHOT CRAWLER just goes ahead and + * signs the object. Since the inode does not need writeback, version will + * not be incremented and directly the object will be signed. + * 3) If the inode needs writeback, then it means the inode was forgotten after + * the versioning and it has to be signed now. + * + * This is the algorithm followed: + * if (ongoing_version == signed_version); then + * object_is_not_stale; + * return; + * else; then + * if (!inode_needs_writeback && inode_sign_state != NORMAL); then + * object_is_stale; + * if (inode_needs_writeback); then + * object_is_stale; + * + * For SCRUBBER, no need to check for the sign state and inode writeback. 
+ * If the ondisk ongoingversion and the ondisk signed version does not match, + * then treat the object as stale. + */ +char +br_stub_is_object_stale(xlator_t *this, call_frame_t *frame, inode_t *inode, + br_version_t *obuf, br_signature_t *sbuf) +{ + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + char stale = 0; + + if (obuf->ongoingversion == sbuf->signedversion) + goto out; + + if (frame->root->pid == GF_CLIENT_PID_SCRUB) { + stale = 1; + goto out; + } + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + if ((!__br_stub_is_inode_dirty(ctx) && + ctx->info_sign != BR_SIGN_NORMAL) || + __br_stub_is_inode_dirty(ctx)) + stale = 1; + } + UNLOCK(&inode->lock); + +out: + return stale; +} + +int +br_stub_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int32_t ret = 0; + size_t totallen = 0; + size_t signaturelen = 0; + br_stub_private_t *priv = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_isignature_out_t *sign = NULL; + br_vxattr_status_t status; + br_stub_local_t *local = NULL; + inode_t *inode = NULL; + gf_boolean_t bad_object = _gf_false; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + + if (op_ret < 0) + goto unwind; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkeys); + + if (cookie != (void *)BR_STUB_REQUEST_COOKIE) + goto unwind; + + local = frame->local; + frame->local = NULL; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + inode = local->u.context.inode; + + op_ret = -1; + status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object); + + op_errno = EIO; + if (bad_object) + goto delkeys; + + 
op_errno = EINVAL; + if (status == BR_VXATTR_STATUS_INVALID) + goto delkeys; + + op_errno = ENODATA; + if ((status == BR_VXATTR_STATUS_MISSING) || + (status == BR_VXATTR_STATUS_UNSIGNED)) + goto delkeys; + + /** + * okay.. we have enough information to satisfy the request, + * namely: version and signing extended attribute. what's + * pending is the signature length -- that's figured out + * indirectly via the size of the _whole_ xattr and the + * on-disk signing xattr header size. + */ + op_errno = EINVAL; + { + /* + * Read the size into a real uint32_t: storing through a + * (uint32_t *) cast of a size_t lands in the wrong half of + * the variable on big-endian LP64 hosts. + */ + uint32_t xattrsize = 0; + + ret = dict_get_uint32(xattr, BITROT_SIGNING_XATTR_SIZE_KEY, + &xattrsize); + if (ret) + goto delkeys; + + /* + * An xattr shorter than its own header cannot carry a + * signature; fail with EINVAL instead of letting the + * unsigned subtraction below wrap around. + */ + if (xattrsize < sizeof(br_signature_t)) + goto delkeys; + + signaturelen = xattrsize - sizeof(br_signature_t); + } + totallen = sizeof(br_isignature_out_t) + signaturelen; + + op_errno = ENOMEM; + sign = GF_CALLOC(1, totallen, gf_br_stub_mt_signature_t); + if (!sign) + goto delkeys; + + sign->time[0] = obuf->timebuf[0]; + sign->time[1] = obuf->timebuf[1]; + + /* Object's dirty state & current signed version */ + sign->version = sbuf->signedversion; + sign->stale = br_stub_is_object_stale(this, frame, inode, obuf, sbuf); + + /* Object's signature */ + sign->signaturelen = signaturelen; + sign->signaturetype = sbuf->signaturetype; + (void)memcpy(sign->signature, sbuf->signature, signaturelen); + + op_errno = EINVAL; + ret = dict_set_bin(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void *)sign, + totallen); + if (ret < 0) { + GF_FREE(sign); + goto delkeys; + } + /* success: the reply length is the size of the signature record */ + op_errno = 0; + op_ret = totallen; + +delkeys: + br_stub_remove_vxattrs(xattr, _gf_true); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +static void +br_stub_send_stub_init_time(call_frame_t *frame, xlator_t *this) +{ + int op_ret = 0; + int op_errno = 0; + dict_t *xattr = NULL; + br_stub_init_t stub = { + { + 0, + }, + }; + br_stub_private_t *priv = NULL; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = 
ENOMEM; + goto unwind; + } + + stub.timebuf[0] = priv->boot[0]; + stub.timebuf[1] = priv->boot[1]; + memcpy(stub.export, priv->export, strlen(priv->export) + 1); + + op_ret = dict_set_static_bin(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, + (void *)&stub, sizeof(br_stub_init_t)); + if (op_ret < 0) { + op_errno = EINVAL; + goto unwind; + } + + op_ret = sizeof(br_stub_init_t); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, NULL); + + if (xattr) + dict_unref(xattr); +} + +int +br_stub_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + void *cookie = NULL; + static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr(name)) + goto unwind; + + priv = this->private; + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + /** + * If xattr is node-uuid and the inode is marked bad, return EIO. + * Returning EIO would result in AFR to choose correct node-uuid + * corresponding to the subvolume * where the good copy of the + * file resides. 
+ */ + if (IA_ISREG(loc->inode->ia_type) && XATTR_IS_NODE_UUID(name) && + br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno)) { + goto unwind; + } + + /** + * this special extended attribute is allowed only on root + */ + if (name && + (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) && + ((gf_uuid_compare(loc->gfid, rootgfid) == 0) || + (gf_uuid_compare(loc->inode->gfid, rootgfid) == 0))) { + BR_STUB_RESET_LOCAL_NULL(frame); + br_stub_send_stub_init_time(frame, this); + return 0; + } + + if (!IA_ISREG(loc->inode->ia_type)) + goto wind; + + if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) { + cookie = (void *)BR_STUB_REQUEST_COOKIE; + + local = br_stub_alloc_local(this); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + } + +wind: + STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +unwind: + BR_STUB_RESET_LOCAL_NULL(frame); + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +int +br_stub_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + void *cookie = NULL; + static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr(name)) + goto unwind; + + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + /** + * If xattr is node-uuid and the inode is marked bad, return EIO. 
+ * Returning EIO would result in AFR to choose correct node-uuid + * corresponding to the subvolume * where the good copy of the + * file resides. + */ + if (IA_ISREG(fd->inode->ia_type) && XATTR_IS_NODE_UUID(name) && + br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno)) { + goto unwind; + } + + /** + * this special extended attribute is allowed only on root + */ + if (name && + (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) && + (gf_uuid_compare(fd->inode->gfid, rootgfid) == 0)) { + BR_STUB_RESET_LOCAL_NULL(frame); + br_stub_send_stub_init_time(frame, this); + return 0; + } + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) { + cookie = (void *)BR_STUB_REQUEST_COOKIE; + + local = br_stub_alloc_local(this); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + } + +wind: + STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +unwind: + BR_STUB_RESET_LOCAL_NULL(frame); + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +int32_t +br_stub_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, 
&op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL, + NULL); + return 0; +} + +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. The subsequent write + * responses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + br_stub_local_t *local = NULL; + + /* detach the local from the frame; it is released after the unwind */ + local = frame->local; + frame->local = NULL; + + /* the write failed below us: pass the error through unchanged */ + if (op_ret < 0) + goto unwind; + + /* a failure to record the modification is reported as EINVAL */ + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + + return 0; +} + +/* Winds the write to the first child with br_stub_writev_cbk as callback. */ +int32_t +br_stub_writev_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + STACK_WIND(frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} + +/** + * This is probably the most crucial part about the whole versioning thing. + * There's absolutely no differentiation as such between an anonymous fd + * and a regular fd except the fd context initialization. Object versioning + * is performed when the inode is dirty. Parallel write operations are no + * special with each write performing object versioning followed by marking + * the inode as non-dirty (synced). 
This is followed by the actual operation + * (writev() in this case) which on a success marks the inode as modified. + * This prevents signing of objects that have not been modified. + */ +int32_t +br_stub_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fop_writev_cbk_t cbk = default_writev_cbk; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + /** + * The inode is not dirty and also witnessed at least one successful + * modification operation. Therefore, subsequent operations need not + * perform any special tracking. + */ + if (!inc_version && modified) + goto wind; + + /** + * okay.. so, either the inode needs versioning or the modification + * needs to be tracked. ->cbk is set to the appropriate callback + * routine for this. + * NOTE: ->local needs to be deallocated on failures from here on. 
+ */ + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_writev_cbk; + goto wind; + } + + stub = fop_writev_stub(frame, br_stub_writev_resume, fd, vector, count, + offset, flags, iobref, xdata); + + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "write gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +int32_t +br_stub_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = -1; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + + return 0; +} + +int32_t +br_stub_ftruncate_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + STACK_WIND(frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +/* c.f. 
br_stub_writev() for explanation */ +int32_t +br_stub_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fop_ftruncate_cbk_t cbk = default_ftruncate_cbk; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + if (!inc_version && modified) + goto wind; + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_ftruncate_cbk; + goto wind; + } + + stub = fop_ftruncate_stub(frame, br_stub_ftruncate_resume, fd, offset, + xdata); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "ftruncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +int32_t +br_stub_truncate_cbk(call_frame_t *frame, void *cookie, 
xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +/* + * Resumes a truncate once versioning is done: drops one reference on the + * fd kept in the frame local and winds the truncate to the first child. + * The reply is handled by br_stub_truncate_cbk so that it is unwound as + * a truncate (not an ftruncate) response. + */ +int32_t +br_stub_truncate_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = frame->local; + + fd_unref(local->u.context.fd); + STACK_WIND(frame, br_stub_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + * + * c.f. 
br_stub_writev() for explanation + */ +int32_t +br_stub_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fd_t *fd = NULL; + fop_truncate_cbk_t cbk = default_truncate_cbk; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + /* truncate carries no fd, so versioning is tracked on an anonymous one */ + fd = fd_anonymous(loc->inode); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CREATE_ANONYMOUS_FD_FAILED, + "inode-gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto unwind; + } + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto cleanup_fd; + + /* + * From here on every failure must leave through cleanup_fd (or + * cleanup_local, which falls into it) so that the reference taken by + * fd_anonymous() above is dropped before unwinding. + */ + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto cleanup_fd; + + if (!inc_version && modified) + goto wind; + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto cleanup_fd; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_truncate_cbk; + goto wind; + } + + stub = fop_truncate_stub(frame, br_stub_truncate_resume, loc, offset, + xdata); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "truncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, + loc, offset, xdata); + if (fd) + 
fd_unref(fd); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); +cleanup_fd: + fd_unref(fd); +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * threby finding non-empty fd list. + */ + +int +br_stub_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret) { + ret = br_stub_init_inode_versions(this, fd, fd->inode, version, + _gf_true, _gf_false, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } 
+ } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + if (frame->root->pid == GF_CLIENT_PID_SCRUB) + goto wind; + + if (flags == O_RDONLY) + goto wind; + + ret = br_stub_add_fd_to_inode(this, fd, ctx); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_LIST_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } + +wind: + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* creat() */ + +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. + */ +int32_t +br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_require_release_call(this, fd, &br_stub_fd); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + LOCK(&fd->inode->lock); + { + list_add_tail(&ctx->fd_list, &br_stub_fd->list); + } + UNLOCK(&fd->inode->lock); + + ret = 0; + +out: + return ret; +} + +int +br_stub_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (!priv->do_versioning) + goto unwind; + + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret < 0) { + ret = br_stub_init_inode_versions(this, fd, inode, 
version, _gf_true, + _gf_false, &ctx_addr); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + } else { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + ret = br_stub_add_fd_to_inode(this, fd, ctx); + } + +unwind: + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + return 0; +} + +int +br_stub_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + STACK_WIND(frame, br_stub_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int +br_stub_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int32_t ret = -1; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (!priv->do_versioning) + goto unwind; + + ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true, + _gf_false, NULL); + /** + * Like lookup, if init_inode_versions fail, return EINVAL + */ + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; +} + +int +br_stub_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) +{ + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, 
unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + STACK_WIND(frame, br_stub_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL); + return 0; +} + +/** }}} */ + +/** + * As of now, only lookup searches for bad object xattr and marks the + * object as bad in its inode context if the xattr is present. But there + * is a possibility that, at the time of the lookup the object was not + * marked bad (i.e. bad object xattr was not set), and later its marked + * as bad. In this case, object is not bad, so when a fop such as open or + * readv or writev comes on the object, the fop will be sent downward instead + * of sending as error upwards. + * The solution for this is to do a getxattr for the below list of fops. + * lookup, readdirp, open, readv, writev. + * But doing getxattr for each of the above fops might be costly. + * So another method followed is to catch the bad file marking by the scrubber + * and set that info within the object's inode context. In this way getxattr + * calls can be avoided and bad objects can be caught instantly. Fetching the + * xattr is needed only in lookups when there is a brick restart or inode + * forget. + * + * If the dict (@xattr) is NULL, then how should that be handled? Fail the + * lookup operation? Or let it continue with version being initialized to + * BITROT_DEFAULT_CURRENT_VERSION. But what if the version was different + * on disk (and also a right signature was there), but posix failed to + * successfully allocate the dict? Posix does not treat call back xdata + * creattion failure as the lookup failure. 
+ */ +static int32_t +br_stub_lookup_version(xlator_t *this, uuid_t gfid, inode_t *inode, + dict_t *xattr) +{ + unsigned long version = 0; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_vxattr_status_t status; + gf_boolean_t bad_object = _gf_false; + + /** + * versioning xattrs were requested from POSIX. if available, figure + * out the correct version to use in the inode context (start with + * the default version if unavailable). As of now versions are not + * persisted on-disk. The inode is marked dirty, so that the first + * operation (such as write(), etc..) triggers synchronization to + * disk. + */ + status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object); + version = ((status == BR_VXATTR_STATUS_FULL) || + (status == BR_VXATTR_STATUS_UNSIGNED)) + ? obuf->ongoingversion + : BITROT_DEFAULT_CURRENT_VERSION; + + /** + * If signature is there, but version is not there then that status is + * is treated as INVALID. So in that case, we should not initialize the + * inode context with wrong version names etc. 
+ */ + if (status == BR_VXATTR_STATUS_INVALID) + return -1; + + return br_stub_init_inode_versions(this, NULL, inode, version, _gf_true, + bad_object, NULL); +} + +/** {{{ */ + +int32_t +br_stub_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + br_stub_private_t *priv = NULL; + br_stub_fd_t *fd_ctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + priv = this->private; + if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid)) + goto normal; + + fd_ctx = br_stub_fd_new(); + if (!fd_ctx) { + op_errno = ENOMEM; + goto unwind; + } + + fd_ctx->bad_object.dir_eof = -1; + fd_ctx->bad_object.dir = sys_opendir(priv->stub_basepath); + if (!fd_ctx->bad_object.dir) { + op_errno = errno; + goto err_freectx; + } + + op_ret = br_stub_fd_ctx_set(this, fd, fd_ctx); + if (!op_ret) + goto unwind; + + sys_closedir(fd_ctx->bad_object.dir); + +err_freectx: + GF_FREE(fd_ctx); +unwind: + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL); + return 0; + +normal: + STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} + +int32_t +br_stub_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + call_stub_t *stub = NULL; + br_stub_private_t *priv = NULL; + + priv = this->private; + if (!priv->do_versioning) + goto out; + + if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid)) + goto out; + stub = fop_readdir_stub(frame, br_stub_readdir_wrapper, fd, size, off, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; + } + br_stub_worker_enqueue(this, stub); + return 0; +out: + STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata); + return 0; +} + +int +br_stub_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *dict) +{ 
+ int32_t ret = 0; + uint64_t ctxaddr = 0; + gf_dirent_t *entry = NULL; + br_stub_private_t *priv = NULL; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind); + + if (op_ret < 0) + goto unwind; + + list_for_each_entry(entry, &entries->list, list) + { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0)) + continue; + + if (!IA_ISREG(entry->d_stat.ia_type)) + continue; + + /* + * Readdirp for most part is a bulk lookup for all the entries + * present in the directory being read. Ideally, for each + * entry, the handling should be similar to that of a lookup + * callback. But for now, just keeping this as it has been + * until now (which means, this comment has been added much + * later as part of a change that wanted to send the flag + * of true/false to br_stub_remove_vxattrs to indicate whether + * the bad-object xattr should be removed from the entry->dict + * or not). Until this change, the function br_stub_remove_vxattrs + * was just removing all the xattrs associated with bit-rot-stub + * (like version, bad-object, signature etc). But, there are + * scenarios where we only want to send bad-object xattr and not + * others. So this comment is part of that change which also + * mentions about another possible change that might be needed + * in future. + * But for now, adding _gf_true means functionally its same as + * what this function was doing before. Just remove all the stub + * related xattrs. 
+ */ + ret = br_stub_get_inode_ctx(this, entry->inode, &ctxaddr); + if (ret < 0) + ctxaddr = 0; + if (ctxaddr) { /* already has the context */ + br_stub_remove_vxattrs(entry->dict, _gf_true); + continue; + } + + ret = br_stub_lookup_version(this, entry->inode->gfid, entry->inode, + entry->dict); + br_stub_remove_vxattrs(entry->dict, _gf_true); + if (ret) { + /** + * there's no per-file granularity support in case of + * failure. let's fail the entire request for now.. + */ + break; + } + } + + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, dict); + + return 0; +} + +int +br_stub_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + int32_t ret = -1; + int op_errno = 0; + gf_boolean_t xref = _gf_false; + br_stub_private_t *priv = NULL; + + priv = this->private; + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + op_errno = ENOMEM; + if (!dict) { + dict = dict_new(); + if (!dict) + goto unwind; + } else { + dict = dict_ref(dict); + } + + xref = _gf_true; + + op_errno = EINVAL; + ret = dict_set_uint32(dict, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(dict, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(dict, BITROT_OBJECT_BAD_KEY, 0); + if (ret) + goto unwind; + +wind: + STACK_WIND(frame, br_stub_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + goto unref_dict; + +unwind: + if (frame->local == (void *)0x1) + frame->local = NULL; + STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL); + return 0; + +unref_dict: + if (xref) + dict_unref(dict); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* lookup() */ + +/** + * This function mainly handles the ENOENT error for the bad objects. 
Though + * br_stub_forget () handles removal of the link for the bad object from the + * quarantine directory, its better to handle it in lookup as well, where + * a failed lookup on a bad object with ENOENT, will trigger deletion of the + * link for the bad object from quarantine directory. So whoever comes first + * either forget () or lookup () will take care of removing the link. + */ +void +br_stub_handle_lookup_error(xlator_t *this, inode_t *inode, int32_t op_errno) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + if (op_errno != ENOENT) + goto out; + + if (!inode_is_linked(inode)) + goto out; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) + goto out; + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + if (__br_stub_is_bad_object(ctx)) + (void)br_stub_del(this, inode->gfid); + } + UNLOCK(&inode->lock); + + if (__br_stub_is_bad_object(ctx)) { + /* File is not present, might be deleted for recovery, + * del the bitrot inode context + */ + ctx_addr = 0; + inode_ctx_del(inode, this, &ctx_addr); + if (ctx_addr) { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + GF_FREE(ctx); + } + } + +out: + return; +} + +int +br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + int32_t ret = 0; + br_stub_private_t *priv = NULL; + gf_boolean_t ver_enabled = _gf_false; + gf_boolean_t remove_bad_file_marker = _gf_true; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + + if (op_ret < 0) { + (void)br_stub_handle_lookup_error(this, inode, op_errno); + + /* + * If the lookup error is not ENOENT, then it is better + * to send the bad file marker to the higher layer (if + * it has been set) + */ + if (op_errno != ENOENT) + remove_bad_file_marker = _gf_false; + goto delkey; + } + + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkey); + + if 
(!IA_ISREG(stbuf->ia_type)) + goto unwind; + + /** + * If the object is bad, then "bad inode" marker has to be sent back + * in resoinse, for revalidated lookups as well. Some xlators such as + * quick-read might cache the data in revalidated lookup as fresh + * lookup would anyway have sent "bad inode" marker. + * In general send bad inode marker for every lookup operation on the + * bad object. + */ + if (cookie != (void *)BR_STUB_REQUEST_COOKIE) { + ret = br_stub_mark_xdata_bad_object(this, inode, xattr); + if (ret) { + op_ret = -1; + op_errno = EIO; + /* + * This flag ensures that in the label @delkey below, + * bad file marker is not removed from the dictinary, + * but other virtual xattrs (such as version, signature) + * are removed. + */ + remove_bad_file_marker = _gf_false; + } + goto delkey; + } + + ret = br_stub_lookup_version(this, stbuf->ia_gfid, inode, xattr); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + goto delkey; + } + + /** + * If the object is bad, send "bad inode" marker back in response + * for xlator(s) to act accordingly (such as quick-read, etc..) + */ + ret = br_stub_mark_xdata_bad_object(this, inode, xattr); + if (ret) { + /** + * aaha! bad object, but sorry we would not + * satisfy the request on allocation failures. 
+ */ + op_ret = -1; + op_errno = EIO; + goto delkey; + } + +delkey: + br_stub_remove_vxattrs(xattr, remove_bad_file_marker); +unwind: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); + + return 0; +} + +int +br_stub_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int op_errno = 0; + void *cookie = NULL; + uint64_t ctx_addr = 0; + gf_boolean_t xref = _gf_false; + br_stub_private_t *priv = NULL; + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + priv = this->private; + + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + if (!gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid) || + !gf_uuid_compare(loc->pargfid, priv->bad_object_dir_gfid)) { + stub = fop_lookup_stub(frame, br_stub_lookup_wrapper, loc, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } + br_stub_worker_enqueue(this, stub); + return 0; + } + + ret = br_stub_get_inode_ctx(this, loc->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) + goto wind; + + /** + * fresh lookup: request version keys from POSIX + */ + op_errno = ENOMEM; + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto unwind; + } else { + xdata = dict_ref(xdata); + } + + xref = _gf_true; + + /** + * Requesting both xattrs provides a way of sanity checking the + * object. Anomaly checking is done in cbk by examining absence + * of either or both xattrs. 
+ */ + op_errno = EINVAL; + ret = dict_set_uint32(xdata, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(xdata, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(xdata, BITROT_OBJECT_BAD_KEY, 0); + if (ret) + goto unwind; + cookie = (void *)BR_STUB_REQUEST_COOKIE; + +wind: + STACK_WIND_COOKIE(frame, br_stub_lookup_cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + goto dealloc_dict; + +unwind: + if (frame->local == (void *)0x1) + frame->local = NULL; + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); +dealloc_dict: + if (xref) + dict_unref(xdata); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* stat */ +int +br_stub_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(loc->inode->ia_type)) + goto wind; + + ret = br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, + loc, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/* fstat */ +int +br_stub_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t ret = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, + fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/** }}} */ + +/** 
{{{ */

/* unlink() */

/**
 * unlink() callback: once the entry is gone, drop the quarantine link of a
 * regular file that was marked bad. The local saved by br_stub_unlink() is
 * released on every path.
 */
int
br_stub_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
                   struct iatt *postparent, dict_t *xdata)
{
    br_stub_local_t *local = NULL;
    inode_t *inode = NULL;
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;
    br_stub_private_t *priv = NULL;
    gf_boolean_t ver_enabled = _gf_false;

    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
    priv = this->private;
    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind);

    local = frame->local;
    frame->local = NULL;

    if (op_ret < 0)
        goto unwind;

    if (local == NULL) {
        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_NULL_LOCAL, NULL);
        goto unwind;
    }

    inode = local->u.context.inode;
    if (!IA_ISREG(inode->ia_type))
        goto unwind;

    if (br_stub_get_inode_ctx(this, inode, &ctx_addr)) {
        /**
         * Without a context there is no way to tell whether the object
         * was bad, so its gfid may remain listed in the quarantine
         * directory and has to be removed manually. The object itself
         * is already deleted, so only warn instead of failing the fop.
         */
        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
                "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
        goto unwind;
    }

    ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;

    LOCK(&inode->lock);
    {
        /**
         * The result of br_stub_del () is deliberately dropped: nothing
         * more can be done here if removing the quarantine entry fails,
         * and the failure is logged.
         */
        if (__br_stub_is_bad_object(ctx))
            (void)br_stub_del(this, inode->gfid);
    }
    UNLOCK(&inode->lock);

unwind:
    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
                        xdata);
    br_stub_cleanup_local(local);
    br_stub_dealloc_local(local);
    return 0;
}

/**
 * unlink(): when versioning is active, remember the inode in a local so the
 * callback can clean up after it.
 */
int
br_stub_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
               dict_t *xdata)
{
    br_stub_local_t *local = NULL;
    int32_t op_ret = -1;
    int32_t op_errno = 0;
    br_stub_private_t *priv = this->private;

    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);

    local = br_stub_alloc_local(this);
    if (local == NULL) {
        op_ret = -1;
        op_errno = ENOMEM;
        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_ALLOC_MEM_FAILED,
                "local path=%s", loc->path, "gfid=%s",
                uuid_utoa(loc->inode->gfid), NULL);
        goto unwind;
    }

    br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid,
                       BR_STUB_NO_VERSIONING, 0);

    frame->local = local;

wind:
    STACK_WIND(frame, br_stub_unlink_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->unlink, loc, flag, xdata);
    return 0;

unwind:
    if (frame->local == (void *)0x1)
        frame->local = NULL;
    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL);
    return 0;
}

/** }}} */

/** {{{ */

/* forget() */

/* Detach this xlator's context from the inode and free it, if present. */
int
br_stub_forget(xlator_t *this, inode_t *inode)
{
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;

    inode_ctx_del(inode, this, &ctx_addr);
    if (ctx_addr) {
        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
        GF_FREE(ctx);
    }

    return 0;
}

/** }}} */

/** {{{ */

/* Callback of the release ipc fop: only tears the call stack down. */
int32_t
br_stub_noop(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
             int32_t op_errno, dict_t *xdata)
{
    STACK_DESTROY(frame->root);
    return 0;
}

/**
 * Send a release event for fd's inode (version @releaseversion, sign state
 * @sign_info) to the changelog target through an ipc fop. Failures are
 * logged and otherwise ignored.
 */
static void
br_stub_send_ipc_fop(xlator_t *this, fd_t *fd, unsigned long releaseversion,
                     int sign_info)
{
    int32_t op = 0;
    dict_t *xdata = NULL;
    call_frame_t *frame = NULL;
    changelog_event_t ev = {
        0,
    };

    ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE;
    ev.u.releasebr.version = releaseversion;
    ev.u.releasebr.sign_info = sign_info;
    gf_uuid_copy(ev.u.releasebr.gfid, fd->inode->gfid);

    xdata = dict_new();
    if (xdata == NULL) {
        gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, BRS_MSG_DICT_ALLOC_FAILED,
                NULL);
        return;
    }

    if (dict_set_static_bin(xdata, "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE)) {
        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_EVENT_FAILED, NULL);
        goto dealloc_dict;
    }

    frame = create_frame(this, this->ctx->pool);
    if (frame == NULL) {
        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_CREATE_FRAME_FAILED,
                NULL);
        goto dealloc_dict;
    }

    op = GF_IPC_TARGET_CHANGELOG;
    STACK_WIND(frame, br_stub_noop, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->ipc, op, xdata);

dealloc_dict:
    dict_unref(xdata);
}

/**
 * This is how the state machine of sign info works:
 * 3 states:
 * 1) BR_SIGN_NORMAL => The default State of the inode
 * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen
 * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign
 * 2 events:
 * 1) GF_FOP_RELEASE
 * 2) GF_FOP_WRITE (actually a dummy write for BitD)
 *
 * This is how states are changed based on events:
 * EVENT: GF_FOP_RELEASE:
 *    if (state == BR_SIGN_NORMAL) ; then
 *        set state = BR_SIGN_REOPEN_WAIT;
 *    if (state == BR_SIGN_QUICK); then
 *        set state = BR_SIGN_NORMAL;
 * EVENT: GF_FOP_WRITE:
 *    if (state == BR_SIGN_REOPEN_WAIT); then
 *        set state = BR_SIGN_QUICK;
 */
br_sign_state_t
__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
                           fd_t *fd)
{
    br_sign_state_t state = BR_SIGN_INVALID;

    if (fop == GF_FOP_FSETXATTR) {
        ctx->info_sign = BR_SIGN_QUICK;
        state = BR_SIGN_QUICK;
    } else if (fop == GF_FOP_RELEASE) {
        GF_ASSERT(ctx->info_sign != BR_SIGN_REOPEN_WAIT);

        if (ctx->info_sign == BR_SIGN_NORMAL) {
            ctx->info_sign = BR_SIGN_REOPEN_WAIT;
            state = BR_SIGN_REOPEN_WAIT;
        } else {
            /* report the state being left, then fall back to normal */
            state = ctx->info_sign;
            ctx->info_sign = BR_SIGN_NORMAL;
        }
    }

    return state;
}

/**
 * release(): unlink the fd from the inode's tracking list and, when
 * __br_stub_can_trigger_release() says so, send the release event. The fd
 * context is freed in all cases.
 */
int32_t
br_stub_release(xlator_t *this, fd_t *fd)
{
    int32_t trigger = 0;
    int32_t flags = 0;
    inode_t *inode = fd->inode;
    unsigned long releaseversion = 0;
    br_stub_inode_ctx_t *ctx = NULL;
    uint64_t tmp = 0;
    br_stub_fd_t *br_stub_fd = NULL;
    int32_t signinfo = 0;

    LOCK(&inode->lock);
    {
        ctx = __br_stub_get_ongoing_version_ctx(this, inode, NULL);
        if (ctx == NULL)
            goto unblock;

        br_stub_fd = br_stub_fd_ctx_get(this, fd);
        if (br_stub_fd != NULL)
            list_del_init(&br_stub_fd->list);

        trigger = __br_stub_can_trigger_release(inode, ctx, &releaseversion);
        if (!trigger)
            goto unblock;

        signinfo = __br_stub_inode_sign_state(ctx, GF_FOP_RELEASE, fd);
        signinfo = htonl(signinfo);

        /* inode back to initial state: mark dirty */
        if (ctx->info_sign == BR_SIGN_NORMAL) {
            __br_stub_mark_inode_dirty(ctx);
            __br_stub_unset_inode_modified(ctx);
        }
    }
unblock:
    UNLOCK(&inode->lock);

    if (trigger) {
        gf_msg_debug(this->name, 0,
                     "releaseversion: %lu | flags: %d "
                     "| signinfo: %d",
                     (unsigned long)ntohl(releaseversion), flags,
                     ntohl(signinfo));
        br_stub_send_ipc_fop(this, fd, releaseversion, signinfo);
    }

    fd_ctx_del(fd, this, &tmp);
    br_stub_fd = (br_stub_fd_t *)(long)tmp;

    GF_FREE(br_stub_fd);

    return 0;
}

/**
 * releasedir(): close the directory stream kept in the fd context (if any)
 * and free the context.
 */
int32_t
br_stub_releasedir(xlator_t *this, fd_t *fd)
{
    br_stub_fd_t *fctx = NULL;
    uint64_t ctx = 0;

    if (fd_ctx_del(fd, this, &ctx) < 0)
        return 0;

    fctx = (br_stub_fd_t *)(long)ctx;
    if (fctx->bad_object.dir) {
        if (sys_closedir(fctx->bad_object.dir))
            gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL,
                    "error=%s", strerror(errno), NULL);
    }

    GF_FREE(fctx);
    return 0;
}

/** }}} */

/** {{{ */

/* ictxmerge */

void
+br_stub_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode, + inode_t *linked_inode) +{ + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_get_inode_ctx(this, inode, &ctxaddr); + if (ret < 0) + goto done; + ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctxaddr; + + LOCK(&linked_inode->lock); + { + ret = __br_stub_get_inode_ctx(this, linked_inode, &lctxaddr); + if (ret < 0) + goto unblock; + lctx = (br_stub_inode_ctx_t *)(uintptr_t)lctxaddr; + + GF_ASSERT(list_is_singular(&ctx->fd_list)); + br_stub_fd = list_first_entry(&ctx->fd_list, br_stub_fd_t, list); + if (br_stub_fd) { + GF_ASSERT(br_stub_fd->fd == fd); + list_move_tail(&br_stub_fd->list, &lctx->fd_list); + } + } +unblock: + UNLOCK(&linked_inode->lock); + +done: + return; +} + +/** }}} */ + +struct xlator_fops fops = { + .lookup = br_stub_lookup, + .stat = br_stub_stat, + .fstat = br_stub_fstat, + .open = br_stub_open, + .create = br_stub_create, + .readdirp = br_stub_readdirp, + .getxattr = br_stub_getxattr, + .fgetxattr = br_stub_fgetxattr, + .fsetxattr = br_stub_fsetxattr, + .writev = br_stub_writev, + .truncate = br_stub_truncate, + .ftruncate = br_stub_ftruncate, + .mknod = br_stub_mknod, + .readv = br_stub_readv, + .removexattr = br_stub_removexattr, + .fremovexattr = br_stub_fremovexattr, + .setxattr = br_stub_setxattr, + .opendir = br_stub_opendir, + .readdir = br_stub_readdir, + .unlink = br_stub_unlink, +}; + +struct xlator_cbks cbks = { + .forget = br_stub_forget, + .release = br_stub_release, + .ictxmerge = br_stub_ictxmerge, +}; + +struct volume_options options[] = { + {.key = {"bitrot"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .tags = {"bitrot"}, + .description = "enable/disable bitrot stub"}, + {.key = {"export"}, + .type = GF_OPTION_TYPE_PATH, + .op_version 
= {GD_OP_VERSION_3_7_0}, + .tags = {"bitrot"}, + .description = "brick path for versioning", + .default_value = "{{ brick.path }}"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "bitrot-stub", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h new file mode 100644 index 00000000000..edd79a77e4f --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -0,0 +1,515 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_STUB_H__ +#define __BIT_ROT_STUB_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/call-stub.h> +#include "bit-rot-stub-mem-types.h" +#include <glusterfs/syscall.h> +#include <glusterfs/common-utils.h> +#include "bit-rot-common.h" +#include "bit-rot-stub-messages.h" +#include "glusterfs3-xdr.h" +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> + +#define BAD_OBJECT_THREAD_STACK_SIZE ((size_t)(1024 * 1024)) +#define BR_STUB_DUMP_STR_SIZE 65536 + +#define BR_PATH_MAX_EXTRA (PATH_MAX + 1024) +#define BR_PATH_MAX_PLUS (PATH_MAX + 2048) + +/* + * Oops. Spelling mistake. 
Correcting it + */ +#define OLD_BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quanrantine" +#define BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quarantine" + +/* do not reference frame->local in cbk unless initialized. + * Assigned 0x1 marks verisoning flag between call path and + * cbk path. + */ +#define BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, label) \ + do { \ + if (priv->do_versioning) \ + frame->local = (void *)0x1; \ + else \ + goto label; \ + } while (0) + +#define BR_STUB_VER_COND_GOTO(priv, cond, label) \ + do { \ + if (!priv->do_versioning || cond) \ + goto label; \ + } while (0) + +#define BR_STUB_VER_ENABLED_IN_CALLPATH(frame, flag) \ + do { \ + if (frame->local) \ + flag = _gf_true; \ + if (frame->local == (void *)0x1) \ + frame->local = NULL; \ + } while (0) + +#define BR_STUB_RESET_LOCAL_NULL(frame) \ + do { \ + if (frame->local == (void *)0x1) \ + frame->local = NULL; \ + } while (0) + +typedef int(br_stub_version_cbk)(call_frame_t *, void *, xlator_t *, int32_t, + int32_t, dict_t *); + +typedef struct br_stub_inode_ctx { + int need_writeback; /* does the inode need + a writeback to disk? */ + unsigned long currentversion; /* ongoing version */ + + int info_sign; + struct list_head fd_list; /* list of open fds or fds participating in + write operations */ + gf_boolean_t bad_object; +} br_stub_inode_ctx_t; + +typedef struct br_stub_fd { + fd_t *fd; + struct list_head list; + struct bad_object_dir { + DIR *dir; + off_t dir_eof; + } bad_object; +} br_stub_fd_t; + +#define I_DIRTY (1 << 0) /* inode needs writeback */ +#define I_MODIFIED (1 << 1) +#define WRITEBACK_DURABLE 1 /* writeback is durable */ + +/** + * This could just have been a plain struct without unions and all, + * but we may need additional things in the future. 
+ */ +typedef struct br_stub_local { + call_stub_t *fopstub; /* stub for original fop */ + + int versioningtype; /* not much used atm */ + + union { + struct br_stub_ctx { + fd_t *fd; + uuid_t gfid; + inode_t *inode; + unsigned long version; + } context; + } u; +} br_stub_local_t; + +#define BR_STUB_NO_VERSIONING (1 << 0) +#define BR_STUB_INCREMENTAL_VERSIONING (1 << 1) + +typedef struct br_stub_private { + gf_boolean_t do_versioning; + + uint32_t boot[2]; + char export[PATH_MAX]; + + pthread_mutex_t lock; + pthread_cond_t cond; + + struct list_head squeue; /* ordered signing queue */ + pthread_t signth; + struct bad_objects_container { + pthread_t thread; + pthread_mutex_t bad_lock; + pthread_cond_t bad_cond; + struct list_head bad_queue; + } container; + struct mem_pool *local_pool; + + char stub_basepath[BR_PATH_MAX_EXTRA]; + + uuid_t bad_object_dir_gfid; +} br_stub_private_t; + +br_stub_fd_t * +br_stub_fd_new(void); + +int +__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd); + +br_stub_fd_t * +__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd); + +br_stub_fd_t * +br_stub_fd_ctx_get(xlator_t *this, fd_t *fd); + +int32_t +br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd); + +static inline gf_boolean_t +__br_stub_is_bad_object(br_stub_inode_ctx_t *ctx) +{ + return ctx->bad_object; +} + +static inline void +__br_stub_mark_object_bad(br_stub_inode_ctx_t *ctx) +{ + ctx->bad_object = _gf_true; +} + +/* inode writeback helpers */ +static inline void +__br_stub_mark_inode_dirty(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_DIRTY; +} + +static inline void +__br_stub_mark_inode_synced(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_DIRTY; +} + +static inline int +__br_stub_is_inode_dirty(br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_DIRTY); +} + +/* inode mofification markers */ +static inline void +__br_stub_set_inode_modified(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_MODIFIED; +} + 
+static inline void +__br_stub_unset_inode_modified(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_MODIFIED; +} + +static inline int +__br_stub_is_inode_modified(br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_MODIFIED); +} + +static inline int +br_stub_require_release_call(xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx) +{ + int32_t ret = 0; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_new(); + if (!br_stub_fd) + return -1; + + br_stub_fd->fd = fd; + INIT_LIST_HEAD(&br_stub_fd->list); + + ret = br_stub_fd_ctx_set(this, fd, br_stub_fd); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED, + NULL); + else + *fd_ctx = br_stub_fd; + + return ret; +} + +/* get/set inode context helpers */ + +static inline int +__br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx) +{ + return __inode_ctx_get(inode, this, ctx); +} + +static inline int +br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __br_stub_get_inode_ctx(this, inode, ctx); + } + UNLOCK(&inode->lock); + + return ret; +} + +static inline int +br_stub_set_inode_ctx(xlator_t *this, inode_t *inode, br_stub_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx; + return inode_ctx_set(inode, this, &ctx_addr); +} + +/* version get/set helpers */ + +static inline unsigned long +__br_stub_writeback_version(br_stub_inode_ctx_t *ctx) +{ + return (ctx->currentversion + 1); +} + +static inline void +__br_stub_set_ongoing_version(br_stub_inode_ctx_t *ctx, unsigned long version) +{ + if (ctx->currentversion < version) + ctx->currentversion = version; + else + gf_smsg("bit-rot-stub", GF_LOG_WARNING, 0, + BRS_MSG_CHANGE_VERSION_FAILED, "current version=%lu", + ctx->currentversion, "new version=%lu", version, NULL); +} + +static inline int +__br_stub_can_trigger_release(inode_t *inode, br_stub_inode_ctx_t *ctx, + unsigned long *version) +{ + /** + * If the inode 
is modified, then it has to be dirty. An inode is + * marked dirty once version is increased. Its marked as modified + * when the modification call (write/truncate) which triggered + * the versioning is successful. + */ + if (__br_stub_is_inode_modified(ctx) && list_empty(&ctx->fd_list) && + (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) { + GF_ASSERT(__br_stub_is_inode_dirty(ctx) == 0); + + if (version) + *version = htonl(ctx->currentversion); + return 1; + } + + return 0; +} + +static inline int32_t +br_stub_get_ongoing_version(xlator_t *this, inode_t *inode, + unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + *version = ctx->currentversion; + } +unblock: + UNLOCK(&inode->lock); + + return ret; +} + +/** + * fetch the current version from inode and return the context. + * inode->lock should be held before invoking this as context + * *needs* to be valid in the caller. 
+ */ +static inline br_stub_inode_ctx_t * +__br_stub_get_ongoing_version_ctx(xlator_t *this, inode_t *inode, + unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret < 0) + return NULL; + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + if (version) + *version = ctx->currentversion; + + return ctx; +} + +/* filter for xattr fetch */ +static inline int +br_stub_is_internal_xattr(const char *name) +{ + if (name && ((strncmp(name, BITROT_CURRENT_VERSION_KEY, + SLEN(BITROT_CURRENT_VERSION_KEY)) == 0) || + (strncmp(name, BITROT_SIGNING_VERSION_KEY, + SLEN(BITROT_SIGNING_VERSION_KEY)) == 0))) + return 1; + return 0; +} + +static inline void +br_stub_remove_vxattrs(dict_t *xattr, gf_boolean_t remove_bad_marker) +{ + if (xattr) { + /* + * When a file is corrupted, bad-object should be + * set in the dict. But, other info such as version, + * signature etc should not be set. Hence the flag + * remove_bad_marker. The consumer should know whether + * to send the bad-object info in the dict or not. + */ + if (remove_bad_marker) + dict_del(xattr, BITROT_OBJECT_BAD_KEY); + dict_del(xattr, BITROT_CURRENT_VERSION_KEY); + dict_del(xattr, BITROT_SIGNING_VERSION_KEY); + dict_del(xattr, BITROT_SIGNING_XATTR_SIZE_KEY); + } +} + +/** + * This function returns the below values for different situations + * 0 => as per the inode context object is not bad + * -1 => Failed to get the inode context itself + * -2 => As per the inode context object is bad + * Both -ve values means the fop which called this function is failed + * and error is returned upwards. + * In future if needed or more errors have to be handled, then those + * errors can be made into enums. 
+ */ +static inline int +br_stub_is_bad_object(xlator_t *this, inode_t *inode) +{ + int bad_object = 0; + gf_boolean_t tmp = _gf_false; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "inode-gfid=%s", uuid_utoa(inode->gfid), NULL); + bad_object = -1; + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + tmp = __br_stub_is_bad_object(ctx); + if (tmp) + bad_object = -2; + } + UNLOCK(&inode->lock); + +out: + return bad_object; +} + +static inline int32_t +br_stub_mark_object_bad(xlator_t *this, inode_t *inode) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "inode-gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + __br_stub_mark_object_bad(ctx); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +/** + * There is a possibility that dict_set might fail. The o/p of dict_set is + * given to the caller and the caller has to decide what to do. 
+ */ +static inline int32_t +br_stub_mark_xdata_bad_object(xlator_t *this, inode_t *inode, dict_t *xdata) +{ + int32_t ret = 0; + + if (br_stub_is_bad_object(this, inode) == -2) + ret = dict_set_int32(xdata, GLUSTERFS_BAD_INODE, 1); + + return ret; +} + +int32_t +br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx); + +br_sign_state_t +__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop, + fd_t *fd); + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv); + +int +br_stub_add(xlator_t *this, uuid_t gfid); + +int32_t +br_stub_create_stub_gfid(xlator_t *this, char *stub_gfid_path, uuid_t gfid); + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv); + +call_stub_t * +__br_stub_dequeue(struct list_head *callstubs); + +void +__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub); + +void +br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub); + +void * +br_stub_worker(void *data); + +int32_t +br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req); + +int32_t +br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata); + +int +br_stub_del(xlator_t *this, uuid_t gfid); + +int +br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries, + dict_t **dict); + +void +br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry, + dict_t *dict); + +int +br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode, + uuid_t gfid, char **path); + +#endif /* __BIT_ROT_STUB_H__ */ diff --git a/xlators/features/changelog/lib/examples/c/get-changes-multi.c b/xlators/features/changelog/lib/examples/c/get-changes-multi.c new file mode 100644 index 00000000000..5ea5bbb6630 --- /dev/null +++ b/xlators/features/changelog/lib/examples/c/get-changes-multi.c @@ -0,0 +1,90 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/** + * Compile it using: + * gcc -o getchanges-multi `pkg-config --cflags libgfchangelog` \ + * get-changes-multi.c `pkg-config --libs libgfchangelog` + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/un.h> +#include <limits.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <errno.h> + +#include "changelog.h" + +void * +brick_init(void *xl, struct gf_brick_spec *brick) +{ + return brick; +} + +void +brick_fini(void *xl, char *brick, void *data) +{ + return; +} + +void +brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev) +{ + printf("->callback: (brick,type) [%s:%d]\n", brick, ev->ev_type); +} + +void +fill_brick_spec(struct gf_brick_spec *brick, char *path) +{ + brick->brick_path = strdup(path); + brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE; + + brick->init = brick_init; + brick->fini = brick_fini; + brick->callback = brick_callback; + brick->connected = NULL; + brick->disconnected = NULL; +} + +int +main(int argc, char **argv) +{ + int ret = 0; + void *bricks = NULL; + struct gf_brick_spec *brick = NULL; + + bricks = calloc(2, sizeof(struct gf_brick_spec)); + if (!bricks) + goto error_return; + + brick = (struct gf_brick_spec *)bricks; + fill_brick_spec(brick, "/export/z1/zwoop"); + + brick++; + fill_brick_spec(brick, "/export/z2/zwoop"); + + ret = gf_changelog_init(NULL); + if (ret) + goto error_return; + + ret = gf_changelog_register_generic((struct gf_brick_spec *)bricks, 2, 0, + "/tmp/multi-changes.log", 9, NULL); + if (ret) + goto error_return; + + /* let callbacks do the job */ + select(0, NULL, NULL, NULL, NULL); + +error_return: + return -1; +} diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c 
b/xlators/features/changelog/lib/examples/c/get-changes.c index 14562585aa9..8bc651c24a4 100644 --- a/xlators/features/changelog/lib/examples/c/get-changes.c +++ b/xlators/features/changelog/lib/examples/c/get-changes.c @@ -27,61 +27,67 @@ #include "changelog.h" -#define handle_error(fn) \ - printf ("%s (reason: %s)\n", fn, strerror (errno)) +#define handle_error(fn) printf("%s (reason: %s)\n", fn, strerror(errno)) int -main (int argc, char ** argv) +main(int argc, char **argv) { - int i = 0; - int ret = 0; - ssize_t nr_changes = 0; - ssize_t changes = 0; - char fbuf[PATH_MAX] = {0,}; - - /* get changes for brick "/home/vshankar/export/yow/yow-1" */ - ret = gf_changelog_register ("/home/vshankar/export/yow/yow-1", - "/tmp/scratch", "/tmp/change.log", 9, 5); - if (ret) { - handle_error ("register failed"); - goto out; + int i = 0; + int ret = 0; + ssize_t nr_changes = 0; + ssize_t changes = 0; + char fbuf[PATH_MAX] = { + 0, + }; + + ret = gf_changelog_init(NULL); + if (ret) { + handle_error("Init failed"); + goto out; + } + + /* get changes for brick "/home/vshankar/export/yow/yow-1" */ + ret = gf_changelog_register("/export/z1/zwoop", "/tmp/scratch", + "/tmp/change.log", 9, 5); + if (ret) { + handle_error("register failed"); + goto out; + } + + while (1) { + i = 0; + nr_changes = gf_changelog_scan(); + if (nr_changes < 0) { + handle_error("scan(): "); + break; } - while (1) { - i = 0; - nr_changes = gf_changelog_scan (); - if (nr_changes < 0) { - handle_error ("scan(): "); - break; - } + if (nr_changes == 0) + goto next; - if (nr_changes == 0) - goto next; + printf("Got %ld changelog files\n", nr_changes); - printf ("Got %ld changelog files\n", nr_changes); + while ((changes = gf_changelog_next_change(fbuf, PATH_MAX)) > 0) { + printf("changelog file [%d]: %s\n", ++i, fbuf); - while ( (changes = - gf_changelog_next_change (fbuf, PATH_MAX)) > 0) { - printf ("changelog file [%d]: %s\n", ++i, fbuf); + /* process changelog */ + /* ... */ + /* ... */ + /* ... 
*/ + /* done processing */ - /* process changelog */ - /* ... */ - /* ... */ - /* ... */ - /* done processing */ - - ret = gf_changelog_done (fbuf); - if (ret) - handle_error ("gf_changelog_done"); - } + ret = gf_changelog_done(fbuf); + if (ret) + handle_error("gf_changelog_done"); + } - if (changes == -1) - handle_error ("gf_changelog_next_change"); + if (changes == -1) + handle_error("gf_changelog_next_change"); - next: - sleep (10); - } + next: + sleep(10); + } - out: - return ret; +out: + return ret; } diff --git a/xlators/features/changelog/lib/examples/c/get-history.c b/xlators/features/changelog/lib/examples/c/get-history.c new file mode 100644 index 00000000000..3e888d75ca6 --- /dev/null +++ b/xlators/features/changelog/lib/examples/c/get-history.c @@ -0,0 +1,116 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +/** + * get set of new changes every 10 seconds (just print the file names) + * + * Compile it using: + * gcc -o gethistory `pkg-config --cflags libgfchangelog` get-history.c \ + * `pkg-config --libs libgfchangelog` + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/un.h> +#include <limits.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <errno.h> + +#include "changelog.h" + +#define handle_error(fn) printf("%s (reason: %s)\n", fn, strerror(errno)) + +int +main(int argc, char **argv) +{ + int i = 0; + int ret = 0; + ssize_t nr_changes = 0; + ssize_t changes = 0; + char fbuf[PATH_MAX] = { + 0, + }; + unsigned long end_ts = 0; + + ret = gf_changelog_init(NULL); + if (ret) { + handle_error("init failed"); + goto out; + } + + ret = gf_changelog_register("/export/z1/zwoop", "/tmp/scratch_v1", + "/tmp/changes.log", 9, 5); + if (ret) { + handle_error("register failed"); + goto out; + } + + int a, b; + printf("give the two numbers start and end\t"); + scanf("%d%d", &a, &b); + ret = gf_history_changelog("/export/z1/zwoop/.glusterfs/changelogs", a, b, + 3, &end_ts); + if (ret == -1) { + printf("history failed"); + goto out; + } + + printf("end time till when changelog available : %d , ret(%d) \t", end_ts, + ret); + fflush(stdout); + + while (1) { + nr_changes = gf_history_changelog_scan(); + printf("scanned, nr_changes : %d\n", nr_changes); + if (nr_changes < 0) { + handle_error("scan(): "); + break; + } + + if (nr_changes == 0) { + printf("done scanning \n"); + goto out; + } + + printf("Got %ld changelog files\n", nr_changes); + + while ((changes = gf_history_changelog_next_change(fbuf, PATH_MAX)) > + 0) { + printf("changelog file [%d]: %s\n", ++i, fbuf); + + /* process changelog */ + /* ... */ + /* ... */ + /* ... 
*/ + /* done processing */ + + ret = gf_history_changelog_done(fbuf); + if (ret) + handle_error("gf_changelog_done"); + } + /* + if (changes == -1) + handle_error ("gf_changelog_next_change"); + if (nr_changes ==1){ + printf("continue scanning\n"); + } + + if(nr_changes == 0){ + printf("done scanning \n"); + goto out; + } + */ + } + +out: + return ret; +} diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py index d21db8eab2e..c410d3b000d 100644..100755 --- a/xlators/features/changelog/lib/examples/python/changes.py +++ b/xlators/features/changelog/lib/examples/python/changes.py @@ -1,5 +1,6 @@ -#!/usr/bin/python +#!/usr/bin/python3 +from __future__ import print_function import os import sys import time @@ -10,23 +11,24 @@ cl = libgfchangelog.Changes() def get_changes(brick, scratch_dir, log_file, log_level, interval): change_list = [] try: + cl.cl_init() cl.cl_register(brick, scratch_dir, log_file, log_level) while True: cl.cl_scan() change_list = cl.cl_getchanges() if change_list: - print change_list + print(change_list) for change in change_list: - print('done with %s' % (change)) + print(('done with %s' % (change))) cl.cl_done(change) time.sleep(interval) except OSError: ex = sys.exc_info()[1] - print ex + print(ex) if __name__ == '__main__': - if len(sys.argv) != 5: - print("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>" - % (sys.argv[0])) + if len(sys.argv) != 6: + print(("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>" + % (sys.argv[0]))) sys.exit(1) get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4])) diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py index 68ec3baf144..2da9f2d2a8c 100644 --- a/xlators/features/changelog/lib/examples/python/libgfchangelog.py +++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py @@ -3,7 +3,8 @@ 
from ctypes import * from ctypes.util import find_library class Changes(object): - libgfc = CDLL(find_library("gfchangelog"), use_errno=True) + libgfc = CDLL(find_library("gfchangelog"), mode=RTLD_GLOBAL, + use_errno=True) @classmethod def geterrno(cls): @@ -19,6 +20,12 @@ class Changes(object): return getattr(cls.libgfc, call) @classmethod + def cl_init(cls): + ret = cls._get_api('gf_changelog_init')(None) + if ret == -1: + cls.raise_changelog_err() + + @classmethod def cl_register(cls, brick, path, log_file, log_level, retries = 0): ret = cls._get_api('gf_changelog_register')(brick, path, log_file, log_level, retries) diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am index fbaaea628b7..c933ec53ed2 100644 --- a/xlators/features/changelog/lib/src/Makefile.am +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -1,37 +1,35 @@ libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \ - -DDATADIR=\"$(localstatedir)\" + -DDATADIR=\"$(localstatedir)\" -libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \ - -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/xlators/features/changelog/src \ - -DDATADIR=\"$(localstatedir)\" +libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -D__USE_LARGEFILE64 -fpic \ + -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/features/changelog/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/rpc-transport/socket/src \ + -DDATADIR=\"$(localstatedir)\" libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - $(GF_GLUSTERFS_LIBS) + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la -libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) +libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) \ + -version-info $(LIBGFCHANGELOG_LT_VERSION) \ + $(GF_NO_UNDEFINED) 
-libgfchangelogdir = $(includedir)/glusterfs/gfchangelog lib_LTLIBRARIES = libgfchangelog.la CONTRIB_BUILDDIR = $(top_builddir)/contrib -libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \ - gf-changelog-helpers.c $(CONTRIBDIR)/uuid/clear.c \ - $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \ - $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \ - $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \ - $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \ - $(CONTRIBDIR)/uuid/unpack.c +libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-journal-handler.c \ + gf-changelog-helpers.c gf-changelog-api.c gf-history-changelog.c \ + gf-changelog-rpc.c gf-changelog-reborp.c \ + $(top_srcdir)/xlators/features/changelog/src/changelog-rpc-common.c -noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \ - $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ - $(CONTRIB_BUILDDIR)/uuid/uuid_types.h - -libgfchangelog_HEADERS = changelog.h +noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h \ + gf-changelog-journal.h changelog-lib-messages.h CLEANFILES = -CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h $(top_builddir)/libglusterfs/src/libglusterfs.la: $(MAKE) -C $(top_builddir)/libglusterfs/src/ all diff --git a/xlators/features/changelog/lib/src/changelog-lib-messages.h b/xlators/features/changelog/lib/src/changelog-lib-messages.h new file mode 100644 index 00000000000..d7fe7274353 --- /dev/null +++ b/xlators/features/changelog/lib/src/changelog-lib-messages.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+ */ + +#ifndef _CHANGELOG_LIB_MESSAGES_H_ +#define _CHANGELOG_LIB_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID( + CHANGELOG_LIB, CHANGELOG_LIB_MSG_OPEN_FAILED, + CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, + CHANGELOG_LIB_MSG_SCRATCH_DIR_ENTRIES_CREATION_ERROR, + CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED, CHANGELOG_LIB_MSG_OPENDIR_ERROR, + CHANGELOG_LIB_MSG_RENAME_FAILED, CHANGELOG_LIB_MSG_READ_ERROR, + CHANGELOG_LIB_MSG_HTIME_ERROR, CHANGELOG_LIB_MSG_GET_TIME_ERROR, + CHANGELOG_LIB_MSG_WRITE_FAILED, CHANGELOG_LIB_MSG_PTHREAD_ERROR, + CHANGELOG_LIB_MSG_MMAP_FAILED, CHANGELOG_LIB_MSG_MUNMAP_FAILED, + CHANGELOG_LIB_MSG_ASCII_ERROR, CHANGELOG_LIB_MSG_STAT_FAILED, + CHANGELOG_LIB_MSG_GET_XATTR_FAILED, CHANGELOG_LIB_MSG_PUBLISH_ERROR, + CHANGELOG_LIB_MSG_PARSE_ERROR, CHANGELOG_LIB_MSG_MIN_MAX_INFO, + CHANGELOG_LIB_MSG_CLEANUP_ERROR, CHANGELOG_LIB_MSG_UNLINK_FAILED, + CHANGELOG_LIB_MSG_NOTIFY_REGISTER_FAILED, + CHANGELOG_LIB_MSG_INVOKE_RPC_FAILED, CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, + CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO, + CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, CHANGELOG_LIB_MSG_XDR_DECODING_FAILED, + CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO, + CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING, + CHANGELOG_LIB_MSG_COPY_FROM_BUFFER_FAILED, + CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED, CHANGELOG_LIB_MSG_HIST_FAILED, + CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED, + CHANGELOG_LIB_MSG_REQUESTING_INFO, CHANGELOG_LIB_MSG_FINAL_INFO); + +#define CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO_STR "Registering brick" +#define CHANGELOG_LIB_MSG_RENAME_FAILED_STR "error moving changelog file" 
+#define CHANGELOG_LIB_MSG_OPEN_FAILED_STR "cannot open changelog file" +#define CHANGELOG_LIB_MSG_UNLINK_FAILED_STR "failed to unlink" +#define CHANGELOG_LIB_MSG_FAILED_TO_RMDIR_STR "failed to rmdir" +#define CHANGELOG_LIB_MSG_STAT_FAILED_STR "stat failed on changelog file" +#define CHANGELOG_LIB_MSG_PARSE_ERROR_STR "could not parse changelog" +#define CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED_STR \ + "parsing error, ceased publishing..." +#define CHANGELOG_LIB_MSG_HTIME_ERROR_STR "fop failed on htime file" +#define CHANGELOG_LIB_MSG_GET_XATTR_FAILED_STR \ + "error extracting max timstamp from htime file" +#define CHANGELOG_LIB_MSG_MIN_MAX_INFO_STR "changelogs min max" +#define CHANGELOG_LIB_MSG_REQUESTING_INFO_STR "Requesting historical changelogs" +#define CHANGELOG_LIB_MSG_FINAL_INFO_STR "FINAL" +#define CHANGELOG_LIB_MSG_HIST_FAILED_STR \ + "Requested changelog range is not available" +#define CHANGELOG_LIB_MSG_GET_TIME_ERROR_STR "wrong result" +#define CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO_STR \ + "Cleaning brick entry for brick" +#define CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO_STR "Draining event" +#define CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO_STR "Drained event" +#define CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO_STR "freeing entry" + +#endif /* !_CHANGELOG_MESSAGES_H_ */ diff --git a/xlators/features/changelog/lib/src/gf-changelog-api.c b/xlators/features/changelog/lib/src/gf-changelog-api.c new file mode 100644 index 00000000000..81a5cbfec10 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-api.c @@ -0,0 +1,224 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/compat-uuid.h> +#include <glusterfs/globals.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/syscall.h> + +#include "gf-changelog-helpers.h" +#include "gf-changelog-journal.h" +#include "changelog-mem-types.h" +#include "changelog-lib-messages.h" + +int +gf_changelog_done(char *file) +{ + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + char to_path[PATH_MAX] = { + 0, + }; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + if (!file || !strlen(file)) + goto out; + + /* make sure 'file' is inside ->jnl_working_dir */ + buffer = realpath(file, NULL); + if (!buffer) + goto out; + + if (strncmp(jnl->jnl_working_dir, buffer, strlen(jnl->jnl_working_dir))) + goto out; + + (void)snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_processed_dir, + basename(buffer)); + gf_msg_debug(this->name, 0, "moving %s to processed directory", file); + ret = sys_rename(buffer, to_path); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s", + to_path, NULL); + goto out; + } + + ret = 0; + +out: + if (buffer) + free(buffer); /* allocated by realpath() */ + return ret; +} + +/** + * @API + * for a set of changelogs, start from the beginning + */ +int +gf_changelog_start_fresh() +{ + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + + this = THIS; + if (!this) + goto out; + + errno = EINVAL; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + if (gf_ftruncate(jnl->jnl_fd, 0)) + goto out; + + return 0; + +out: + return -1; +} + +/** + * @API + * return the next changelog file entry. zero means all chanelogs + * consumed. 
+ */ +ssize_t +gf_changelog_next_change(char *bufptr, size_t maxlen) +{ + ssize_t size = -1; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + char buffer[PATH_MAX] = { + 0, + }; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + tracker_fd = jnl->jnl_fd; + + size = gf_readline(tracker_fd, buffer, maxlen); + if (size < 0) { + size = -1; + goto out; + } + + if (size == 0) + goto out; + + memcpy(bufptr, buffer, size - 1); + bufptr[size - 1] = '\0'; + +out: + return size; +} + +/** + * @API + * gf_changelog_scan() - scan and generate a list of change entries + * + * calling this api multiple times (without calling gf_changlog_done()) + * would result new changelogs(s) being refreshed in the tracker file. + * This call also acts as a cancellation point for the consumer. + */ +ssize_t +gf_changelog_scan() +{ + int tracker_fd = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_journal_t *jnl = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + char buffer[PATH_MAX] = { + 0, + }; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + if (JNL_IS_API_DISCONNECTED(jnl)) { + errno = ENOTCONN; + goto out; + } + + errno = EINVAL; + + tracker_fd = jnl->jnl_fd; + if (gf_ftruncate(tracker_fd, 0)) + goto out; + + rewinddir(jnl->jnl_dir); + + for (;;) { + errno = 0; + entry = sys_readdir(jnl->jnl_dir, scratch); + if (!entry || errno != 0) + break; + + if (!strcmp(basename(entry->d_name), ".") || + !strcmp(basename(entry->d_name), "..")) + continue; + + nr_entries++; + + GF_CHANGELOG_FILL_BUFFER(jnl->jnl_processing_dir, buffer, off, + strlen(jnl->jnl_processing_dir)); + GF_CHANGELOG_FILL_BUFFER(entry->d_name, buffer, off, + strlen(entry->d_name)); + GF_CHANGELOG_FILL_BUFFER("\n", buffer, 
off, 1); + + if (gf_changelog_write(tracker_fd, buffer, off) != off) { + gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_WRITE_FAILED, + "error writing changelog filename" + " to tracker file"); + break; + } + off = 0; + } + + if (!entry) { + if (gf_lseek(tracker_fd, 0, SEEK_SET) != -1) + return nr_entries; + } +out: + return -1; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c index 1eef8bf0479..75f8a6dfc08 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-helpers.c +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c @@ -10,40 +10,37 @@ #include "changelog-mem-types.h" #include "gf-changelog-helpers.h" - -ssize_t gf_changelog_read_path (int fd, char *buffer, size_t bufsize) -{ - return read (fd, buffer, bufsize); -} +#include "changelog-lib-messages.h" +#include <glusterfs/syscall.h> size_t -gf_changelog_write (int fd, char *buffer, size_t len) +gf_changelog_write(int fd, char *buffer, size_t len) { - ssize_t size = 0; - size_t writen = 0; + ssize_t size = 0; + size_t written = 0; - while (writen < len) { - size = write (fd, - buffer + writen, len - writen); - if (size <= 0) - break; + while (written < len) { + size = sys_write(fd, buffer + written, len - written); + if (size <= 0) + break; - writen += size; - } + written += size; + } - return writen; + return written; } void -gf_rfc3986_encode (unsigned char *s, char *enc, char *estr) +gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr) { - for (; *s; s++) { - if (estr[*s]) - sprintf(enc, "%c", estr[*s]); - else - sprintf(enc, "%%%02X", *s); - while (*++enc); - } + for (; *s; s++) { + if (estr[*s]) + sprintf(enc, "%c", estr[*s]); + else + sprintf(enc, "%%%02X", *s); + while (*++enc) + ; + } } /** @@ -56,125 +53,118 @@ gf_rfc3986_encode (unsigned char *s, char *enc, char *estr) * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp), * but this involves mixing POSIX file 
descriptors and stream FILE *). * - * NOTE: This implmentation still does work with more than one fd's + * NOTE: This implementation still does work with more than one fd's * used to perform gf_readline(). For this very reason it's not * made a part of libglusterfs. */ -static pthread_key_t rl_key; -static pthread_once_t rl_once = PTHREAD_ONCE_INIT; - -static void -readline_destructor (void *ptr) -{ - GF_FREE (ptr); -} - -static void -readline_once (void) -{ - pthread_key_create (&rl_key, readline_destructor); -} +static __thread read_line_t thread_tsd = {}; static ssize_t -my_read (read_line_t *tsd, int fd, char *ptr) -{ - if (tsd->rl_cnt <= 0) { - if ( (tsd->rl_cnt = read (fd, tsd->rl_buf, MAXLINE)) < 0 ) - return -1; - else if (tsd->rl_cnt == 0) - return 0; - tsd->rl_bufptr = tsd->rl_buf; - } - - tsd->rl_cnt--; - *ptr = *tsd->rl_bufptr++; - return 1; -} - -static int -gf_readline_init_once (read_line_t **tsd) +my_read(read_line_t *tsd, int fd, char *ptr) { - if (pthread_once (&rl_once, readline_once) != 0) - return -1; - - *tsd = pthread_getspecific (rl_key); - if (*tsd) - goto out; - - *tsd = GF_CALLOC (1, sizeof (**tsd), - gf_changelog_mt_libgfchangelog_rl_t); - if (!*tsd) - return -1; + if (tsd->rl_cnt <= 0) { + tsd->rl_cnt = sys_read(fd, tsd->rl_buf, MAXLINE); - if (pthread_setspecific (rl_key, *tsd) != 0) - return -1; + if (tsd->rl_cnt < 0) + return -1; + else if (tsd->rl_cnt == 0) + return 0; + tsd->rl_bufptr = tsd->rl_buf; + } - out: - return 0; + tsd->rl_cnt--; + *ptr = *tsd->rl_bufptr++; + return 1; } ssize_t -gf_readline (int fd, void *vptr, size_t maxlen) +gf_readline(int fd, void *vptr, size_t maxlen) { - size_t n = 0; - size_t rc = 0; - char c = ' '; - char *ptr = NULL; - read_line_t *tsd = NULL; - - if (gf_readline_init_once (&tsd)) - return -1; - - ptr = vptr; - for (n = 1; n < maxlen; n++) { - if ( (rc = my_read (tsd, fd, &c)) == 1 ) { - *ptr++ = c; - if (c == '\n') - break; - } else if (rc == 0) { - *ptr = '\0'; - return (n - 1); - } else - 
return -1; - } - - *ptr = '\0'; - return n; - + size_t n = 0; + size_t rc = 0; + char c = ' '; + char *ptr = NULL; + read_line_t *tsd = &thread_tsd; + + ptr = vptr; + for (n = 1; n < maxlen; n++) { + if ((rc = my_read(tsd, fd, &c)) == 1) { + *ptr++ = c; + if (c == '\n') + break; + } else if (rc == 0) { + *ptr = '\0'; + return (n - 1); + } else + return -1; + } + + *ptr = '\0'; + return n; } off_t -gf_lseek (int fd, off_t offset, int whence) +gf_lseek(int fd, off_t offset, int whence) { - off_t off = 0; - read_line_t *tsd = NULL; + off_t off = 0; + read_line_t *tsd = &thread_tsd; - if (gf_readline_init_once (&tsd)) - return -1; + off = sys_lseek(fd, offset, whence); + if (off == -1) + return -1; - if ( (off = lseek (fd, offset, whence)) == -1) - return -1; + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; - tsd->rl_cnt = 0; - tsd->rl_bufptr = tsd->rl_buf; - - return off; + return off; } int -gf_ftruncate (int fd, off_t length) +gf_ftruncate(int fd, off_t length) { - read_line_t *tsd = NULL; + read_line_t *tsd = &thread_tsd; - if (gf_readline_init_once (&tsd)) - return -1; + if (sys_ftruncate(fd, 0)) + return -1; - if (ftruncate (fd, 0)) - return -1; + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; - tsd->rl_cnt = 0; - tsd->rl_bufptr = tsd->rl_buf; + return 0; +} - return 0; +int +gf_thread_cleanup(xlator_t *this, pthread_t thread) +{ + int ret = 0; + void *res = NULL; + + ret = pthread_cancel(thread); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING, + "Failed to send cancellation to thread"); + goto error_return; + } + + ret = pthread_join(thread, &res); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING, + "failed to join thread"); + goto error_return; + } + + if (res != PTHREAD_CANCELED) { + gf_msg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING, + "Thread could not be cleaned up"); + goto error_return; + } + + return 0; + 
+error_return: + return -1; } diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h index 3aa6ed7b8e2..9c609d33172 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-helpers.h +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h @@ -14,84 +14,242 @@ #include <unistd.h> #include <dirent.h> #include <limits.h> -#include <pthread.h> +#include <glusterfs/locking.h> -#include <xlator.h> +#include <glusterfs/xlator.h> -#define GF_CHANGELOG_TRACKER "tracker" +#include "changelog.h" -#define GF_CHANGELOG_CURRENT_DIR ".current" -#define GF_CHANGELOG_PROCESSED_DIR ".processed" +#include "changelog-rpc-common.h" +#include "gf-changelog-journal.h" + +#define GF_CHANGELOG_TRACKER "tracker" + +#define GF_CHANGELOG_CURRENT_DIR ".current" +#define GF_CHANGELOG_PROCESSED_DIR ".processed" #define GF_CHANGELOG_PROCESSING_DIR ".processing" +#define GF_CHANGELOG_HISTORY_DIR ".history" +#define TIMESTAMP_LENGTH 10 #ifndef MAXLINE #define MAXLINE 4096 #endif -#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) do { \ - memcpy (ascii + off, ptr, len); \ - off += len; \ - } while (0) +#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) \ + do { \ + memcpy(ascii + off, ptr, len); \ + off += len; \ + } while (0) typedef struct read_line { - int rl_cnt; - char *rl_bufptr; - char rl_buf[MAXLINE]; + int rl_cnt; + char *rl_bufptr; + char rl_buf[MAXLINE]; } read_line_t; +struct gf_changelog; +struct gf_event; + +/** + * Event list for ordered event notification + * + * ->next_seq holds the next _expected_ sequence number. 
+ */ +struct gf_event_list { + pthread_mutex_t lock; /* protects this structure */ + pthread_cond_t cond; + + pthread_t invoker; + + unsigned long next_seq; /* next sequence number expected: + zero during bootstrap */ + + struct gf_changelog *entry; /* backpointer to it's brick + encapsulator (entry) */ + struct list_head events; /* list of events */ +}; + +/** + * include a refcount if it's of use by additional layers + */ +struct gf_event { + int count; + + unsigned long seq; + + struct list_head list; + + struct iovec iov[0]; +}; +#define GF_EVENT_CALLOC_SIZE(cnt, len) \ + (sizeof(struct gf_event) + (cnt * sizeof(struct iovec)) + len) + +/** + * assign the base address of the IO vector to the correct memory +o * area and set it's addressable length. + */ +#define GF_EVENT_ASSIGN_IOVEC(vec, event, len, pos) \ + do { \ + vec->iov_base = ((char *)event) + sizeof(struct gf_event) + \ + (event->count * sizeof(struct iovec)) + pos; \ + vec->iov_len = len; \ + pos += len; \ + } while (0) + +typedef enum gf_changelog_conn_state { + GF_CHANGELOG_CONN_STATE_PENDING = 0, + GF_CHANGELOG_CONN_STATE_ACCEPTED, + GF_CHANGELOG_CONN_STATE_DISCONNECTED, +} gf_changelog_conn_state_t; + +/** + * An instance of this structure is allocated for each brick for which + * notifications are streamed. 
+ */ typedef struct gf_changelog { - xlator_t *this; + gf_lock_t statelock; + gf_changelog_conn_state_t connstate; - /* 'processing' directory stream */ - DIR *gfc_dir; + xlator_t *this; - /* fd to the tracker file */ - int gfc_fd; + struct list_head list; /* list of instances */ - /* connection retries */ - int gfc_connretries; + char brick[PATH_MAX]; /* brick path for this end-point */ - char gfc_sockpath[PATH_MAX]; + changelog_rpc_t grpc; /* rpc{-clnt,svc} for this brick */ +#define RPC_PROBER(ent) ent->grpc.rpc +#define RPC_REBORP(ent) ent->grpc.svc +#define RPC_SOCK(ent) ent->grpc.sock - char gfc_brickpath[PATH_MAX]; + unsigned int notify; /* notification flag(s) */ - /* socket for recieving notifications */ - int gfc_sockfd; + FINI *fini; /* destructor callback */ + CALLBACK *callback; /* event callback dispatcher */ + CONNECT *connected; /* connect callback */ + DISCONNECT *disconnected; /* disconnection callback */ - char *gfc_working_dir; + void *ptr; /* owner specific private data */ + xlator_t *invokerxl; /* consumers _this_, if valid, + assigned to THIS before cbk is + invoked */ - /* RFC 3986 string encoding */ - char rfc3986[256]; + gf_boolean_t ordered; - char gfc_current_dir[PATH_MAX]; - char gfc_processed_dir[PATH_MAX]; - char gfc_processing_dir[PATH_MAX]; + void (*queueevent)(struct gf_event_list *, struct gf_event *); + void (*pickevent)(struct gf_event_list *, struct gf_event **); - pthread_t gfc_changelog_processor; + struct gf_event_list event; } gf_changelog_t; -int -gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc); +static inline int +gf_changelog_filter_check(gf_changelog_t *entry, changelog_event_t *event) +{ + if (event->ev_type & entry->notify) + return 1; + return 0; +} + +#define GF_NEED_ORDERED_EVENTS(ent) (ent->ordered == _gf_true) + +/** private structure */ +typedef struct gf_private { + pthread_mutex_t lock; /* protects ->connections, cleanups */ + pthread_cond_t cond; + + void *api; /* pointer for API access 
*/ + + pthread_t poller; /* event poller thread */ + pthread_t connectionjanitor; /* connection cleaner */ + + struct list_head connections; /* list of connections */ + struct list_head cleanups; /* list of connection to be + cleaned up */ +} gf_private_t; + +#define GF_CHANGELOG_GET_API_PTR(this) (((gf_private_t *)this->private)->api) + +/** + * upcall: invoke callback with _correct_ THIS + */ +#define GF_CHANGELOG_INVOKE_CBK(this, cbk, brick, args...) \ + do { \ + xlator_t *old_this = NULL; \ + xlator_t *invokerxl = NULL; \ + \ + invokerxl = entry->invokerxl; \ + old_this = this; \ + \ + if (invokerxl) { \ + THIS = invokerxl; \ + } \ + \ + cbk(invokerxl, brick, args); \ + THIS = old_this; \ + \ + } while (0) + +#define SAVE_THIS(xl) \ + do { \ + old_this = xl; \ + THIS = master; \ + } while (0) + +#define RESTORE_THIS() \ + do { \ + if (old_this) \ + THIS = old_this; \ + } while (0) + +/** APIs and the rest */ void * -gf_changelog_process (void *data); - -ssize_t -gf_changelog_read_path (int fd, char *buffer, size_t bufsize); +gf_changelog_process(void *data); void -gf_rfc3986_encode (unsigned char *s, char *enc, char *estr); +gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr); size_t -gf_changelog_write (int fd, char *buffer, size_t len); +gf_changelog_write(int fd, char *buffer, size_t len); ssize_t -gf_readline (int fd, void *vptr, size_t maxlen); +gf_readline(int fd, void *vptr, size_t maxlen); int -gf_ftruncate (int fd, off_t length); +gf_ftruncate(int fd, off_t length); off_t -gf_lseek (int fd, off_t offset, int whence); +gf_lseek(int fd, off_t offset, int whence); + +int +gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl, + char *from_path, gf_boolean_t no_publish); +int +gf_changelog_publish(xlator_t *this, gf_changelog_journal_t *jnl, + char *from_path); +int +gf_thread_cleanup(xlator_t *this, pthread_t thread); +void * +gf_changelog_callback_invoker(void *arg); + +int +gf_cleanup_event(xlator_t *, struct 
gf_event_list *); + +/* (un)ordered event queueing */ +void +queue_ordered_event(struct gf_event_list *, struct gf_event *); + +void +queue_unordered_event(struct gf_event_list *, struct gf_event *); + +/* (un)ordered event picking */ +void +pick_event_ordered(struct gf_event_list *, struct gf_event **); + +void +pick_event_unordered(struct gf_event_list *, struct gf_event **); + +/* connection janitor thread */ +void * +gf_changelog_connection_janitor(void *); #endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c new file mode 100644 index 00000000000..7f6e2329e71 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c @@ -0,0 +1,1029 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/compat-uuid.h> +#include <glusterfs/globals.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/syscall.h> +#include <glusterfs/compat-errno.h> + +#include "gf-changelog-helpers.h" + +/* from the changelog translator */ +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +#include "gf-changelog-journal.h" +#include "changelog-lib-messages.h" + +extern int byebye; + +enum changelog_versions { VERSION_1_1 = 0, VERSION_1_2 = 1 }; + +/** + * number of gfid records after fop number + */ +int nr_gfids[2][GF_FOP_MAXVALUE] = {{ + [GF_FOP_MKNOD] = 1, + [GF_FOP_MKDIR] = 1, + [GF_FOP_UNLINK] = 1, + [GF_FOP_RMDIR] = 1, + [GF_FOP_SYMLINK] = 1, + [GF_FOP_RENAME] = 2, + [GF_FOP_LINK] = 1, + [GF_FOP_CREATE] = 1, + }, + { + [GF_FOP_MKNOD] = 1, + [GF_FOP_MKDIR] = 1, + [GF_FOP_UNLINK] = 2, + [GF_FOP_RMDIR] = 2, + [GF_FOP_SYMLINK] = 1, + [GF_FOP_RENAME] = 2, + [GF_FOP_LINK] = 1, + [GF_FOP_CREATE] = 1, + }}; + +int nr_extra_recs[2][GF_FOP_MAXVALUE] = {{ + [GF_FOP_MKNOD] = 3, + [GF_FOP_MKDIR] = 3, + [GF_FOP_UNLINK] = 0, + [GF_FOP_RMDIR] = 0, + [GF_FOP_SYMLINK] = 0, + [GF_FOP_RENAME] = 0, + [GF_FOP_LINK] = 0, + [GF_FOP_CREATE] = 3, + }, + { + [GF_FOP_MKNOD] = 3, + [GF_FOP_MKDIR] = 3, + [GF_FOP_UNLINK] = 0, + [GF_FOP_RMDIR] = 0, + [GF_FOP_SYMLINK] = 0, + [GF_FOP_RENAME] = 0, + [GF_FOP_LINK] = 0, + [GF_FOP_CREATE] = 3, + }}; + +static char * +binary_to_ascii(uuid_t uuid) +{ + return uuid_utoa(uuid); +} + +static char * +conv_noop(char *ptr) +{ + return ptr; +} + +#define VERIFY_SEPARATOR(ptr, plen, perr) \ + { \ + if (*(ptr + plen) != '\0') { \ + perr = 1; \ + break; \ + } \ + } + +#define MOVER_MOVE(mover, nleft, bytes) \ + { \ + mover += bytes; \ + nleft -= bytes; \ + } + +#define PARSE_GFID(mov, ptr, le, fn, perr) \ + { \ + VERIFY_SEPARATOR(mov, le, perr); \ + ptr = fn(mov); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + } + +#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \ + { \ + GF_CHANGELOG_FILL_BUFFER(pt, buf, of, strlen(pt)); \ + 
MOVER_MOVE(mo, nl, le); \ + } + +#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \ + { \ + memcpy(uuid, mover, sizeof(uuid_t)); \ + ptr = binary_to_ascii(uuid); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + MOVER_MOVE(mover, nleft, sizeof(uuid_t)); \ + } + +#define LINE_BUFSIZE (3 * PATH_MAX) /* enough buffer for extra chars too */ + +/** + * using mmap() makes parsing easy. fgets() cannot be used here as + * the binary gfid could contain a line-feed (0x0A), in that case fgets() + * would read an incomplete line and parsing would fail. using POSIX fds + * would result is additional code to maintain state in case of partial + * reads of data (where multiple entries do not fit extirely in the buffer). + * + * mmap() gives the flexibility of pointing to an offset in the file + * without us worrying about reading it in memory (VM does that for us for + * free). + */ + +static int +gf_changelog_parse_binary(xlator_t *this, gf_changelog_journal_t *jnl, + int from_fd, int to_fd, size_t start_offset, + struct stat *stbuf, int version_idx) + +{ + int ret = -1; + off_t off = 0; + off_t nleft = 0; + uuid_t uuid = { + 0, + }; + char *ptr = NULL; + char *bname_start = NULL; + char *bname_end = NULL; + char *mover = NULL; + void *start = NULL; + char current_mover = ' '; + size_t blen = 0; + int parse_err = 0; + char *ascii = NULL; + + ascii = GF_CALLOC(LINE_BUFSIZE, sizeof(char), gf_common_mt_char); + + nleft = stbuf->st_size; + + start = mmap(NULL, nleft, PROT_READ, MAP_PRIVATE, from_fd, 0); + if (start == MAP_FAILED) { + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MMAP_FAILED, + "mmap() error"); + goto out; + } + + mover = start; + + MOVER_MOVE(mover, nleft, start_offset); + + while (nleft > 0) { + off = blen = 0; + ptr = bname_start = bname_end = NULL; + + current_mover = *mover; + + switch (current_mover) { + case 'D': + case 'M': + MOVER_MOVE(mover, nleft, 1); + PARSE_GFID_MOVE(ptr, uuid, mover, nleft, parse_err); + + break; + + case 'E': + 
MOVER_MOVE(mover, nleft, 1); + PARSE_GFID_MOVE(ptr, uuid, mover, nleft, parse_err); + + bname_start = mover; + bname_end = strchr(mover, '\n'); + if (bname_end == NULL) { + parse_err = 1; + break; + } + + blen = bname_end - bname_start; + MOVER_MOVE(mover, nleft, blen); + + break; + + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER(&current_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, strlen(ptr)); + if (blen) + GF_CHANGELOG_FILL_BUFFER(bname_start, ascii, off, blen); + GF_CHANGELOG_FILL_BUFFER("\n", ascii, off, 1); + + if (gf_changelog_write(to_fd, ascii, off) != off) { + gf_msg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_ASCII_ERROR, + "processing binary changelog failed due to " + " error in writing ascii change"); + break; + } + + MOVER_MOVE(mover, nleft, 1); + } + + if ((nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap(start, stbuf->st_size)) + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MUNMAP_FAILED, + "munmap() error"); +out: + if (ascii) + GF_FREE(ascii); + return ret; +} + +/** + * ascii decoder: + * - separate out one entry from another + * - use fop name rather than fop number + */ +static int +gf_changelog_parse_ascii(xlator_t *this, gf_changelog_journal_t *jnl, + int from_fd, int to_fd, size_t start_offset, + struct stat *stbuf, int version_idx) +{ + int ng = 0; + int ret = -1; + int fop = 0; + int len = 0; + off_t off = 0; + off_t nleft = 0; + char *ptr = NULL; + char *eptr = NULL; + void *start = NULL; + char *mover = NULL; + int parse_err = 0; + char current_mover = ' '; + char *ascii = NULL; + const char *fopname = NULL; + + ascii = GF_CALLOC(LINE_BUFSIZE, sizeof(char), gf_common_mt_char); + + nleft = stbuf->st_size; + + start = mmap(NULL, nleft, PROT_READ, MAP_PRIVATE, from_fd, 0); + if (start == MAP_FAILED) { + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MMAP_FAILED, + "mmap() error"); + goto out; + } 
+ + mover = start; + + MOVER_MOVE(mover, nleft, start_offset); + + while (nleft > 0) { + off = 0; + current_mover = *mover; + + GF_CHANGELOG_FILL_BUFFER(&current_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1); + + switch (current_mover) { + case 'D': + MOVER_MOVE(mover, nleft, 1); + + /* target gfid */ + PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop, + parse_err); + FILL_AND_MOVE(ptr, ascii, off, mover, nleft, + UUID_CANONICAL_FORM_LEN); + break; + case 'M': + MOVER_MOVE(mover, nleft, 1); + + /* target gfid */ + PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop, + parse_err); + FILL_AND_MOVE(ptr, ascii, off, mover, nleft, + UUID_CANONICAL_FORM_LEN); + FILL_AND_MOVE(" ", ascii, off, mover, nleft, 1); + + /* fop */ + len = strlen(mover); + VERIFY_SEPARATOR(mover, len, parse_err); + + fop = atoi(mover); + fopname = gf_fop_list[fop]; + if (fopname == NULL) { + parse_err = 1; + break; + } + + MOVER_MOVE(mover, nleft, len); + + len = strlen(fopname); + GF_CHANGELOG_FILL_BUFFER(fopname, ascii, off, len); + + break; + + case 'E': + MOVER_MOVE(mover, nleft, 1); + + /* target gfid */ + PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop, + parse_err); + FILL_AND_MOVE(ptr, ascii, off, mover, nleft, + UUID_CANONICAL_FORM_LEN); + FILL_AND_MOVE(" ", ascii, off, mover, nleft, 1); + + /* fop */ + len = strlen(mover); + VERIFY_SEPARATOR(mover, len, parse_err); + + fop = atoi(mover); + fopname = gf_fop_list[fop]; + if (fopname == NULL) { + parse_err = 1; + break; + } + + MOVER_MOVE(mover, nleft, len); + + len = strlen(fopname); + GF_CHANGELOG_FILL_BUFFER(fopname, ascii, off, len); + + ng = nr_extra_recs[version_idx][fop]; + for (; ng > 0; ng--) { + MOVER_MOVE(mover, nleft, 1); + len = strlen(mover); + VERIFY_SEPARATOR(mover, len, parse_err); + + GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1); + FILL_AND_MOVE(mover, ascii, off, mover, nleft, len); + } + + /* pargfid + bname */ + ng = nr_gfids[version_idx][fop]; + while (ng-- > 0) { + 
MOVER_MOVE(mover, nleft, 1); + len = strlen(mover); + if (!len) { + MOVER_MOVE(mover, nleft, 1); + continue; + } + + GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1); + + PARSE_GFID(mover, ptr, len, conv_noop, parse_err); + eptr = calloc(3, strlen(ptr)); + if (!eptr) { + parse_err = 1; + break; + } + + gf_rfc3986_encode_space_newline((unsigned char *)ptr, eptr, + jnl->rfc3986_space_newline); + FILL_AND_MOVE(eptr, ascii, off, mover, nleft, len); + free(eptr); + } + + break; + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER("\n", ascii, off, 1); + + if (gf_changelog_write(to_fd, ascii, off) != off) { + gf_msg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_ASCII_ERROR, + "processing ascii changelog failed due to " + " error in writing change"); + break; + } + + MOVER_MOVE(mover, nleft, 1); + } + + if ((nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap(start, stbuf->st_size)) + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MUNMAP_FAILED, + "munmap() error"); + +out: + if (ascii) + GF_FREE(ascii); + + return ret; +} + +static int +gf_changelog_decode(xlator_t *this, gf_changelog_journal_t *jnl, int from_fd, + int to_fd, struct stat *stbuf, int *zerob) +{ + int ret = -1; + int encoding = -1; + int major_version = -1; + int minor_version = -1; + int version_idx = -1; + size_t elen = 0; + char buffer[1024] = { + 0, + }; + + CHANGELOG_GET_HEADER_INFO(from_fd, buffer, sizeof(buffer), encoding, + major_version, minor_version, elen); + if (encoding == -1) /* unknown encoding */ + goto out; + + if (major_version == -1) /* unknown major version */ + goto out; + + if (minor_version == -1) /* unknown minor version */ + goto out; + + if (!CHANGELOG_VALID_ENCODING(encoding)) + goto out; + + if (elen == stbuf->st_size) { + *zerob = 1; + goto out; + } + + if (major_version == 1 && minor_version == 1) { + version_idx = VERSION_1_1; + } else if (major_version == 1 && minor_version == 2) { + version_idx = VERSION_1_2; + } + + if 
(version_idx == -1) /* unknown version number */ + goto out; + + /** + * start processing after the header + */ + if (sys_lseek(from_fd, elen, SEEK_SET) < 0) { + goto out; + } + switch (encoding) { + case CHANGELOG_ENCODE_BINARY: + /** + * this ideally should have been a part of changelog-encoders.c + * (ie. part of the changelog translator). + */ + ret = gf_changelog_parse_binary(this, jnl, from_fd, to_fd, elen, + stbuf, version_idx); + break; + + case CHANGELOG_ENCODE_ASCII: + ret = gf_changelog_parse_ascii(this, jnl, from_fd, to_fd, elen, + stbuf, version_idx); + break; + } + +out: + return ret; +} + +int +gf_changelog_publish(xlator_t *this, gf_changelog_journal_t *jnl, + char *from_path) +{ + int ret = 0; + char dest[PATH_MAX] = { + 0, + }; + char to_path[PATH_MAX] = { + 0, + }; + struct stat stbuf = { + 0, + }; + + if (snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_current_dir, + basename(from_path)) >= PATH_MAX) + return -1; + + /* handle zerob file that won't exist in current */ + ret = sys_stat(to_path, &stbuf); + if (ret) { + if (errno == ENOENT) + ret = 0; + goto out; + } + + if (snprintf(dest, PATH_MAX, "%s%s", jnl->jnl_processing_dir, + basename(from_path)) >= PATH_MAX) + return -1; + + ret = sys_rename(to_path, dest); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path, "to=%s", + dest, NULL); + } + +out: + return ret; +} + +int +gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl, + char *from_path, gf_boolean_t no_publish) +{ + int ret = -1; + int fd1 = 0; + int fd2 = 0; + int zerob = 0; + struct stat stbuf = { + 0, + }; + char dest[PATH_MAX] = { + 0, + }; + char to_path[PATH_MAX] = { + 0, + }; + + if (snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_current_dir, + basename(from_path)) >= PATH_MAX) + goto out; + if (snprintf(dest, PATH_MAX, "%s%s", jnl->jnl_processing_dir, + basename(from_path)) >= PATH_MAX) + goto out; + + ret = sys_stat(from_path, &stbuf); + if (ret || 
!S_ISREG(stbuf.st_mode)) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_STAT_FAILED, + "path=%s", from_path, NULL); + goto out; + } + + fd1 = open(from_path, O_RDONLY); + if (fd1 < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED, + "path=%s", from_path, NULL); + goto out; + } + + fd2 = open(to_path, O_CREAT | O_TRUNC | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd2 < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED, + "path=%s", to_path, NULL); + goto close_fd; + } else { + ret = gf_changelog_decode(this, jnl, fd1, fd2, &stbuf, &zerob); + + sys_close(fd2); + + if (!ret) { + /* move it to processing on a successful + decode */ + if (no_publish == _gf_true) + goto close_fd; + ret = sys_rename(to_path, dest); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path, + "to=%s", dest, NULL); + } + + /* remove it from .current if it's an empty file */ + if (zerob) { + /* zerob changelogs must be unlinked */ + ret = sys_unlink(to_path); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=empty changelog", + "path=%s", to_path, NULL); + } + } + +close_fd: + sys_close(fd1); + +out: + return ret; +} + +void * +gf_changelog_process(void *data) +{ + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_entry_t *entry = NULL; + gf_changelog_processor_t *jnl_proc = NULL; + + jnl = data; + jnl_proc = jnl->jnl_proc; + THIS = jnl->this; + this = jnl->this; + + while (1) { + pthread_mutex_lock(&jnl_proc->lock); + { + while (list_empty(&jnl_proc->entries)) { + jnl_proc->waiting = _gf_true; + pthread_cond_wait(&jnl_proc->cond, &jnl_proc->lock); + } + + entry = list_first_entry(&jnl_proc->entries, gf_changelog_entry_t, + list); + if (entry) + list_del(&entry->list); + + jnl_proc->waiting = _gf_false; + } + pthread_mutex_unlock(&jnl_proc->lock); + + if (entry) { + 
(void)gf_changelog_consume(this, jnl, entry->path, _gf_false); + GF_FREE(entry); + } + } + + return NULL; +} + +void +gf_changelog_queue_journal(gf_changelog_processor_t *jnl_proc, + changelog_event_t *event) +{ + size_t len = 0; + gf_changelog_entry_t *entry = NULL; + + entry = GF_CALLOC(1, sizeof(gf_changelog_entry_t), + gf_changelog_mt_libgfchangelog_entry_t); + if (!entry) + return; + INIT_LIST_HEAD(&entry->list); + + len = strlen(event->u.journal.path); + (void)memcpy(entry->path, event->u.journal.path, len + 1); + entry->path[len] = '\0'; + + pthread_mutex_lock(&jnl_proc->lock); + { + list_add_tail(&entry->list, &jnl_proc->entries); + if (jnl_proc->waiting) + pthread_cond_signal(&jnl_proc->cond); + } + pthread_mutex_unlock(&jnl_proc->lock); + + return; +} + +void +gf_changelog_handle_journal(void *xl, char *brick, void *cbkdata, + changelog_event_t *event) +{ + gf_changelog_journal_t *jnl = NULL; + gf_changelog_processor_t *jnl_proc = NULL; + + jnl = cbkdata; + jnl_proc = jnl->jnl_proc; + + gf_changelog_queue_journal(jnl_proc, event); +} + +void +gf_changelog_journal_disconnect(void *xl, char *brick, void *data) +{ + gf_changelog_journal_t *jnl = NULL; + + jnl = data; + + pthread_spin_lock(&jnl->lock); + { + JNL_SET_API_STATE(jnl, JNL_API_DISCONNECTED); + }; + pthread_spin_unlock(&jnl->lock); +} + +void +gf_changelog_journal_connect(void *xl, char *brick, void *data) +{ + gf_changelog_journal_t *jnl = NULL; + + jnl = data; + + pthread_spin_lock(&jnl->lock); + { + JNL_SET_API_STATE(jnl, JNL_API_CONNECTED); + }; + pthread_spin_unlock(&jnl->lock); + + return; +} + +void +gf_changelog_cleanup_processor(gf_changelog_journal_t *jnl) +{ + int ret = 0; + xlator_t *this = NULL; + gf_changelog_processor_t *jnl_proc = NULL; + + this = THIS; + if (!this || !jnl || !jnl->jnl_proc) + goto error_return; + + jnl_proc = jnl->jnl_proc; + + ret = gf_thread_cleanup(this, jnl_proc->processor); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 
CHANGELOG_LIB_MSG_CLEANUP_ERROR, + "failed to cleanup processor thread"); + goto error_return; + } + + (void)pthread_mutex_destroy(&jnl_proc->lock); + (void)pthread_cond_destroy(&jnl_proc->cond); + + GF_FREE(jnl_proc); + +error_return: + return; +} + +int +gf_changelog_init_processor(gf_changelog_journal_t *jnl) +{ + int ret = -1; + gf_changelog_processor_t *jnl_proc = NULL; + + jnl_proc = GF_CALLOC(1, sizeof(gf_changelog_processor_t), + gf_changelog_mt_libgfchangelog_t); + if (!jnl_proc) + goto error_return; + + ret = pthread_mutex_init(&jnl_proc->lock, NULL); + if (ret != 0) + goto free_jnl_proc; + ret = pthread_cond_init(&jnl_proc->cond, NULL); + if (ret != 0) + goto cleanup_mutex; + + INIT_LIST_HEAD(&jnl_proc->entries); + jnl_proc->waiting = _gf_false; + jnl->jnl_proc = jnl_proc; + + ret = gf_thread_create(&jnl_proc->processor, NULL, gf_changelog_process, + jnl, "clogproc"); + if (ret != 0) { + jnl->jnl_proc = NULL; + goto cleanup_cond; + } + + return 0; + +cleanup_cond: + (void)pthread_cond_destroy(&jnl_proc->cond); +cleanup_mutex: + (void)pthread_mutex_destroy(&jnl_proc->lock); +free_jnl_proc: + GF_FREE(jnl_proc); +error_return: + return -1; +} + +static void +gf_changelog_cleanup_fds(gf_changelog_journal_t *jnl) +{ + /* tracker fd */ + if (jnl->jnl_fd != -1) + sys_close(jnl->jnl_fd); + /* processing dir */ + if (jnl->jnl_dir) + sys_closedir(jnl->jnl_dir); + + if (jnl->jnl_working_dir) + free(jnl->jnl_working_dir); /* allocated by realpath */ +} + +static int +gf_changelog_open_dirs(xlator_t *this, gf_changelog_journal_t *jnl) +{ + int ret = -1; + DIR *dir = NULL; + int tracker_fd = 0; + char tracker_path[PATH_MAX] = { + 0, + }; + + /* .current */ + (void)snprintf(jnl->jnl_current_dir, PATH_MAX, + "%s/" GF_CHANGELOG_CURRENT_DIR "/", jnl->jnl_working_dir); + ret = recursive_rmdir(jnl->jnl_current_dir); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s", + jnl->jnl_current_dir, NULL); + goto out; + } + ret = 
mkdir_p(jnl->jnl_current_dir, 0600, _gf_false); + if (ret) + goto out; + + /* .processed */ + (void)snprintf(jnl->jnl_processed_dir, PATH_MAX, + "%s/" GF_CHANGELOG_PROCESSED_DIR "/", jnl->jnl_working_dir); + ret = mkdir_p(jnl->jnl_processed_dir, 0600, _gf_false); + if (ret) + goto out; + + /* .processing */ + (void)snprintf(jnl->jnl_processing_dir, PATH_MAX, + "%s/" GF_CHANGELOG_PROCESSING_DIR "/", jnl->jnl_working_dir); + ret = recursive_rmdir(jnl->jnl_processing_dir); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s", + jnl->jnl_processing_dir, NULL); + goto out; + } + + ret = mkdir_p(jnl->jnl_processing_dir, 0600, _gf_false); + if (ret) + goto out; + + dir = sys_opendir(jnl->jnl_processing_dir); + if (!dir) { + gf_msg("", GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPENDIR_ERROR, + "opendir() error"); + goto out; + } + + jnl->jnl_dir = dir; + + (void)snprintf(tracker_path, PATH_MAX, "%s/" GF_CHANGELOG_TRACKER, + jnl->jnl_working_dir); + + tracker_fd = open(tracker_path, O_CREAT | O_APPEND | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (tracker_fd < 0) { + sys_closedir(jnl->jnl_dir); + ret = -1; + goto out; + } + + jnl->jnl_fd = tracker_fd; + ret = 0; +out: + return ret; +} + +int +gf_changelog_init_history(xlator_t *this, gf_changelog_journal_t *jnl, + char *brick_path) +{ + int i = 0; + int ret = 0; + char hist_scratch_dir[PATH_MAX] = { + 0, + }; + + jnl->hist_jnl = GF_CALLOC(1, sizeof(*jnl), + gf_changelog_mt_libgfchangelog_t); + if (!jnl->hist_jnl) + goto error_return; + + jnl->hist_jnl->jnl_dir = NULL; + jnl->hist_jnl->jnl_fd = -1; + + (void)snprintf(hist_scratch_dir, PATH_MAX, + "%s/" GF_CHANGELOG_HISTORY_DIR "/", jnl->jnl_working_dir); + + ret = mkdir_p(hist_scratch_dir, 0600, _gf_false); + if (ret) + goto dealloc_hist; + + jnl->hist_jnl->jnl_working_dir = realpath(hist_scratch_dir, NULL); + if (!jnl->hist_jnl->jnl_working_dir) + goto dealloc_hist; + + ret = gf_changelog_open_dirs(this, 
jnl->hist_jnl); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_OPENDIR_ERROR, + "could not create entries in history scratch dir"); + goto dealloc_hist; + } + + if (snprintf(jnl->hist_jnl->jnl_brickpath, PATH_MAX, "%s", brick_path) >= + PATH_MAX) + goto dealloc_hist; + + for (i = 0; i < 256; i++) { + jnl->hist_jnl->rfc3986_space_newline[i] = (i == ' ' || i == '\n' || + i == '%') + ? 0 + : i; + } + + return 0; + +dealloc_hist: + GF_FREE(jnl->hist_jnl); + jnl->hist_jnl = NULL; +error_return: + return -1; +} + +void +gf_changelog_journal_fini(void *xl, char *brick, void *data) +{ + gf_changelog_journal_t *jnl = NULL; + + jnl = data; + + gf_changelog_cleanup_processor(jnl); + + gf_changelog_cleanup_fds(jnl); + if (jnl->hist_jnl) + gf_changelog_cleanup_fds(jnl->hist_jnl); + + GF_FREE(jnl); +} + +void * +gf_changelog_journal_init(void *xl, struct gf_brick_spec *brick) +{ + int i = 0; + int ret = 0; + xlator_t *this = NULL; + struct stat buf = { + 0, + }; + char *scratch_dir = NULL; + gf_changelog_journal_t *jnl = NULL; + + this = xl; + scratch_dir = (char *)brick->ptr; + + jnl = GF_CALLOC(1, sizeof(gf_changelog_journal_t), + gf_changelog_mt_libgfchangelog_t); + if (!jnl) + goto error_return; + + if (snprintf(jnl->jnl_brickpath, PATH_MAX, "%s", brick->brick_path) >= + PATH_MAX) + goto dealloc_private; + + if (sys_stat(scratch_dir, &buf) && errno == ENOENT) { + ret = mkdir_p(scratch_dir, 0600, _gf_true); + if (ret) + goto dealloc_private; + } + + jnl->jnl_working_dir = realpath(scratch_dir, NULL); + if (!jnl->jnl_working_dir) + goto dealloc_private; + + ret = gf_changelog_open_dirs(this, jnl); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_OPENDIR_ERROR, + "could not create entries in scratch dir"); + goto dealloc_private; + } + + /* RFC 3986 {de,en}coding */ + for (i = 0; i < 256; i++) { + jnl->rfc3986_space_newline[i] = (i == ' ' || i == '\n' || i == '%') ? 
0 + : i; + } + + ret = gf_changelog_init_history(this, jnl, brick->brick_path); + if (ret) + goto cleanup_fds; + + /* initialize journal processor */ + jnl->this = this; + ret = gf_changelog_init_processor(jnl); + if (ret) + goto cleanup_fds; + + JNL_SET_API_STATE(jnl, JNL_API_CONN_INPROGESS); + ret = pthread_spin_init(&jnl->lock, 0); + if (ret != 0) + goto cleanup_processor; + return jnl; + +cleanup_processor: + gf_changelog_cleanup_processor(jnl); +cleanup_fds: + gf_changelog_cleanup_fds(jnl); + if (jnl->hist_jnl) + gf_changelog_cleanup_fds(jnl->hist_jnl); +dealloc_private: + GF_FREE(jnl); +error_return: + return NULL; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal.h b/xlators/features/changelog/lib/src/gf-changelog-journal.h new file mode 100644 index 00000000000..ba5b9bf827e --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-journal.h @@ -0,0 +1,116 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/*
 * Accessors for the API connection state stored in ->connected.
 * Arguments are parenthesized so that any pointer expression (for
 * example `base + idx`) and any state expression expand correctly.
 */
#define JNL_SET_API_STATE(jnl, state) ((jnl)->connected = (state))
#define JNL_IS_API_DISCONNECTED(jnl)                                           \
    ((jnl)->connected == JNL_API_DISCONNECTED)
int retval; + + /* journal processed */ + char changelog[PATH_MAX]; +} gf_changelog_consume_data_t; + +/* event handler */ +CALLBACK gf_changelog_handle_journal; + +/* init, connect & disconnect handler */ +INIT gf_changelog_journal_init; +FINI gf_changelog_journal_fini; +CONNECT gf_changelog_journal_connect; +DISCONNECT gf_changelog_journal_disconnect; + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-process.c deleted file mode 100644 index df7204931a8..00000000000 --- a/xlators/features/changelog/lib/src/gf-changelog-process.c +++ /dev/null @@ -1,571 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#include <unistd.h> -#include <pthread.h> - -#include "uuid.h" -#include "globals.h" -#include "glusterfs.h" - -#include "gf-changelog-helpers.h" - -/* from the changelog translator */ -#include "changelog-misc.h" - -extern int byebye; - -/** - * number of gfid records after fop number - */ -int nr_gfids[] = { - [GF_FOP_MKNOD] = 1, - [GF_FOP_MKDIR] = 1, - [GF_FOP_UNLINK] = 1, - [GF_FOP_RMDIR] = 1, - [GF_FOP_SYMLINK] = 1, - [GF_FOP_RENAME] = 2, - [GF_FOP_LINK] = 1, - [GF_FOP_CREATE] = 1, -}; - -static char * -binary_to_ascii (uuid_t uuid) -{ - return uuid_utoa (uuid); -} - -static char * -conv_noop (char *ptr) { return ptr; } - -#define VERIFY_SEPARATOR(ptr, plen, perr) \ - { \ - if (*(ptr + plen) != '\0') { \ - perr = 1; \ - break; \ - } \ - } - -#define MOVER_MOVE(mover, nleft, bytes) \ - { \ - mover += bytes; \ - nleft -= bytes; \ - } \ - -#define PARSE_GFID(mov, ptr, le, fn, perr) \ - { \ - VERIFY_SEPARATOR (mov, le, perr); \ - ptr = fn (mov); \ - if (!ptr) { \ - perr = 1; \ - break; \ - } \ - } - -#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \ - { \ - GF_CHANGELOG_FILL_BUFFER (pt, buf, of, strlen (pt)); \ - MOVER_MOVE (mo, nl, le); \ - } - - -#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \ - { \ - memcpy (uuid, mover, sizeof (uuid_t)); \ - ptr = binary_to_ascii (uuid); \ - if (!ptr) { \ - perr = 1; \ - break; \ - } \ - MOVER_MOVE (mover, nleft, sizeof (uuid_t)); \ - } \ - -#define LINE_BUFSIZE 3*PATH_MAX /* enough buffer for extra chars too */ - -/** - * using mmap() makes parsing easy. fgets() cannot be used here as - * the binary gfid could contain a line-feed (0x0A), in that case fgets() - * would read an incomplete line and parsing would fail. using POSIX fds - * would result is additional code to maintain state in case of partial - * reads of data (where multiple entries do not fit extirely in the buffer). 
- * - * mmap() gives the flexibility of pointing to an offset in the file - * without us worrying about reading it in memory (VM does that for us for - * free). - */ - -static int -gf_changelog_parse_binary (xlator_t *this, - gf_changelog_t *gfc, int from_fd, int to_fd, - size_t start_offset, struct stat *stbuf) - -{ - int ret = -1; - off_t off = 0; - off_t nleft = 0; - uuid_t uuid = {0,}; - char *ptr = NULL; - char *bname_start = NULL; - char *bname_end = NULL; - char *mover = NULL; - char *start = NULL; - char current_mover = ' '; - size_t blen = 0; - int parse_err = 0; - char ascii[LINE_BUFSIZE] = {0,}; - - nleft = stbuf->st_size; - - start = (char *) mmap (NULL, nleft, - PROT_READ, MAP_PRIVATE, from_fd, 0); - if (!start) { - gf_log (this->name, GF_LOG_ERROR, - "mmap() error (reason: %s)", strerror (errno)); - goto out; - } - - mover = start; - - MOVER_MOVE (mover, nleft, start_offset); - - while (nleft > 0) { - - off = blen = 0; - ptr = bname_start = bname_end = NULL; - - current_mover = *mover; - - switch (current_mover) { - case 'D': - case 'M': - MOVER_MOVE (mover, nleft, 1); - PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); - - break; - - case 'E': - MOVER_MOVE (mover, nleft, 1); - PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); - - bname_start = mover; - if ( (bname_end = strchr (mover, '\n')) == NULL ) { - parse_err = 1; - break; - } - - blen = bname_end - bname_start; - MOVER_MOVE (mover, nleft, blen); - - break; - - default: - parse_err = 1; - } - - if (parse_err) - break; - - GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); - GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); - GF_CHANGELOG_FILL_BUFFER (ptr, ascii, off, strlen (ptr)); - if (blen) - GF_CHANGELOG_FILL_BUFFER (bname_start, - ascii, off, blen); - GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); - - if (gf_changelog_write (to_fd, ascii, off) != off) { - gf_log (this->name, GF_LOG_ERROR, - "processing binary changelog failed due to " - " error in writing ascii change (reason: %s)", 
- strerror (errno)); - break; - } - - MOVER_MOVE (mover, nleft, 1); - } - - if ( (nleft == 0) && (!parse_err)) - ret = 0; - - if (munmap (start, stbuf->st_size)) - gf_log (this->name, GF_LOG_ERROR, - "munmap() error (reason: %s)", strerror (errno)); - out: - return ret; -} - -/** - * ascii decoder: - * - separate out one entry from another - * - use fop name rather than fop number - */ -static int -gf_changelog_parse_ascii (xlator_t *this, - gf_changelog_t *gfc, int from_fd, int to_fd, - size_t start_offset, struct stat *stbuf) -{ - int ng = 0; - int ret = -1; - int fop = 0; - int len = 0; - off_t off = 0; - off_t nleft = 0; - char *ptr = NULL; - char *eptr = NULL; - char *start = NULL; - char *mover = NULL; - int parse_err = 0; - char current_mover = ' '; - char ascii[LINE_BUFSIZE] = {0,}; - const char *fopname = NULL; - - nleft = stbuf->st_size; - - start = (char *) mmap (NULL, nleft, - PROT_READ, MAP_PRIVATE, from_fd, 0); - if (!start) { - gf_log (this->name, GF_LOG_ERROR, - "mmap() error (reason: %s)", strerror (errno)); - goto out; - } - - mover = start; - - MOVER_MOVE (mover, nleft, start_offset); - - while (nleft > 0) { - off = 0; - current_mover = *mover; - - GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); - GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); - - switch (current_mover) { - case 'D': - case 'M': - MOVER_MOVE (mover, nleft, 1); - - /* target gfid */ - PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, - conv_noop, parse_err); - FILL_AND_MOVE(ptr, ascii, off, - mover, nleft, UUID_CANONICAL_FORM_LEN); - break; - - case 'E': - MOVER_MOVE (mover, nleft, 1); - - /* target gfid */ - PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, - conv_noop, parse_err); - FILL_AND_MOVE (ptr, ascii, off, - mover, nleft, UUID_CANONICAL_FORM_LEN); - FILL_AND_MOVE (" ", ascii, off, - mover, nleft, 1); - - /* fop */ - len = strlen (mover); - VERIFY_SEPARATOR (mover, len, parse_err); - - fop = atoi (mover); - if ( (fopname = gf_fop_list[fop]) == NULL) { - parse_err = 1; - 
break; - } - - MOVER_MOVE (mover, nleft, len); - - len = strlen (fopname); - GF_CHANGELOG_FILL_BUFFER (fopname, ascii, off, len); - - /* pargfid + bname */ - ng = nr_gfids[fop]; - while (ng-- > 0) { - MOVER_MOVE (mover, nleft, 1); - len = strlen (mover); - GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); - - PARSE_GFID (mover, ptr, len, - conv_noop, parse_err); - eptr = calloc (3, strlen (ptr)); - if (!eptr) { - parse_err = 1; - break; - } - - gf_rfc3986_encode ((unsigned char *) ptr, - eptr, gfc->rfc3986); - FILL_AND_MOVE (eptr, ascii, off, - mover, nleft, len); - free (eptr); - } - - break; - default: - parse_err = 1; - } - - if (parse_err) - break; - - GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); - - if (gf_changelog_write (to_fd, ascii, off) != off) { - gf_log (this->name, GF_LOG_ERROR, - "processing ascii changelog failed due to " - " wrror in writing change (reason: %s)", - strerror (errno)); - break; - } - - MOVER_MOVE (mover, nleft, 1); - - } - - if ( (nleft == 0) && (!parse_err)) - ret = 0; - - if (munmap (start, stbuf->st_size)) - gf_log (this->name, GF_LOG_ERROR, - "munmap() error (reason: %s)", strerror (errno)); - - out: - return ret; -} - -#define COPY_BUFSIZE 8192 -static int -gf_changelog_copy (xlator_t *this, int from_fd, int to_fd) -{ - ssize_t size = 0; - char buffer[COPY_BUFSIZE+1] = {0,}; - - while (1) { - size = read (from_fd, buffer, COPY_BUFSIZE); - if (size <= 0) - break; - - if (gf_changelog_write (to_fd, - buffer, size) != size) { - gf_log (this->name, GF_LOG_ERROR, - "error processing ascii changlog"); - size = -1; - break; - } - } - - return (size < 0 ? 
-1 : 0); -} - -static int -gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, - int to_fd, struct stat *stbuf, int *zerob) -{ - int ret = -1; - int encoding = -1; - size_t elen = 0; - char buffer[1024] = {0,}; - - CHANGELOG_GET_ENCODING (from_fd, buffer, 1024, encoding, elen); - if (encoding == -1) /* unknown encoding */ - goto out; - - if (!CHANGELOG_VALID_ENCODING (encoding)) - goto out; - - if (elen == stbuf->st_size) { - *zerob = 1; - goto out; - } - - /** - * start processing after the header - */ - lseek (from_fd, elen, SEEK_SET); - - switch (encoding) { - case CHANGELOG_ENCODE_BINARY: - /** - * this ideally should have been a part of changelog-encoders.c - * (ie. part of the changelog translator). - */ - ret = gf_changelog_parse_binary (this, gfc, from_fd, - to_fd, elen, stbuf); - break; - - case CHANGELOG_ENCODE_ASCII: - ret = gf_changelog_parse_ascii (this, gfc, from_fd, - to_fd, elen, stbuf); - break; - default: - ret = gf_changelog_copy (this, from_fd, to_fd); - } - - out: - return ret; -} - -static int -gf_changelog_consume (xlator_t *this, gf_changelog_t *gfc, char *from_path) -{ - int ret = -1; - int fd1 = 0; - int fd2 = 0; - int zerob = 0; - struct stat stbuf = {0,}; - char dest[PATH_MAX] = {0,}; - char to_path[PATH_MAX] = {0,}; - - ret = stat (from_path, &stbuf); - if (ret || !S_ISREG(stbuf.st_mode)) { - gf_log (this->name, GF_LOG_ERROR, - "stat failed on changelog file: %s", from_path); - goto out; - } - - fd1 = open (from_path, O_RDONLY); - if (fd1 < 0) { - gf_log (this->name, GF_LOG_ERROR, - "cannot open changelog file: %s (reason: %s)", - from_path, strerror (errno)); - goto out; - } - - (void) snprintf (to_path, PATH_MAX, "%s%s", - gfc->gfc_current_dir, basename (from_path)); - (void) snprintf (dest, PATH_MAX, "%s%s", - gfc->gfc_processing_dir, basename (from_path)); - - fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (fd2 < 0) { - gf_log (this->name, GF_LOG_ERROR, - "cannot 
create ascii changelog file %s (reason %s)", - to_path, strerror (errno)); - goto close_fd; - } else { - ret = gf_changelog_decode (this, gfc, fd1, - fd2, &stbuf, &zerob); - - close (fd2); - - if (!ret) { - /* move it to processing on a successfull - decode */ - ret = rename (to_path, dest); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error moving %s to processing dir" - " (reason: %s)", to_path, - strerror (errno)); - } - - /* remove it from .current if it's an empty file */ - if (zerob) { - ret = unlink (to_path); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "could not unlink %s (reason: %s", - to_path, strerror (errno)); - } - } - - close_fd: - close (fd1); - - out: - return ret; -} - -static char * -gf_changelog_ext_change (xlator_t *this, - gf_changelog_t *gfc, char *path, size_t readlen) -{ - int alo = 0; - int ret = 0; - size_t len = 0; - char *buf = NULL; - - buf = path; - while (len < readlen) { - if (*buf == '\0') { - alo = 1; - gf_log (this->name, GF_LOG_DEBUG, - "processing changelog: %s", path); - ret = gf_changelog_consume (this, gfc, path); - } - - if (ret) - break; - - len++; buf++; - if (alo) { - alo = 0; - path = buf; - } - } - - return (ret) ? 
NULL : path; -} - -void * -gf_changelog_process (void *data) -{ - ssize_t len = 0; - ssize_t offlen = 0; - xlator_t *this = NULL; - char *sbuf = NULL; - gf_changelog_t *gfc = NULL; - char from_path[PATH_MAX] = {0,}; - - gfc = (gf_changelog_t *) data; - this = gfc->this; - - pthread_detach (pthread_self()); - - for (;;) { - len = gf_changelog_read_path (gfc->gfc_sockfd, - from_path + offlen, - PATH_MAX - offlen); - if (len < 0) - continue; /* ignore it for now */ - - if (len == 0) { /* close() from the changelog translator */ - gf_log (this->name, GF_LOG_INFO, "close from changelog" - " notification translator."); - - if (gfc->gfc_connretries != 1) { - if (!gf_changelog_notification_init(this, gfc)) - continue; - } - - byebye = 1; - break; - } - - len += offlen; - sbuf = gf_changelog_ext_change (this, gfc, from_path, len); - if (!sbuf) { - gf_log (this->name, GF_LOG_ERROR, - "could not extract changelog filename"); - continue; - } - - offlen = 0; - if (sbuf != (from_path + len)) { - offlen = from_path + len - sbuf; - memmove (from_path, sbuf, offlen); - } - } - - gf_log (this->name, GF_LOG_DEBUG, - "byebye (%d) from processing thread...", byebye); - return NULL; -} diff --git a/xlators/features/changelog/lib/src/gf-changelog-reborp.c b/xlators/features/changelog/lib/src/gf-changelog-reborp.c new file mode 100644 index 00000000000..56b11cbb705 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-reborp.c @@ -0,0 +1,413 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +#include "gf-changelog-helpers.h" +#include "changelog-rpc-common.h" +#include "changelog-lib-messages.h" + +#include <glusterfs/syscall.h> + +/** + * Reverse socket: actual data transfer handler. Connection + * initiator is PROBER, data transfer is REBORP. + */ + +static struct rpcsvc_program *gf_changelog_reborp_programs[]; + +void * +gf_changelog_connection_janitor(void *arg) +{ + int32_t ret = 0; + xlator_t *this = NULL; + gf_private_t *priv = NULL; + gf_changelog_t *entry = NULL; + struct gf_event *event = NULL; + struct gf_event_list *ev = NULL; + unsigned long drained = 0; + + this = arg; + THIS = this; + + priv = this->private; + + while (1) { + pthread_mutex_lock(&priv->lock); + { + while (list_empty(&priv->cleanups)) + pthread_cond_wait(&priv->cond, &priv->lock); + + entry = list_first_entry(&priv->cleanups, gf_changelog_t, list); + list_del_init(&entry->list); + } + pthread_mutex_unlock(&priv->lock); + + drained = 0; + ev = &entry->event; + + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO, "brick=%s", + entry->brick, NULL); + + /* 0x0: disable rpc-clnt */ + rpc_clnt_disable(RPC_PROBER(entry)); + + /* 0x1: cleanup callback invoker thread */ + ret = gf_cleanup_event(this, ev); + if (ret) + continue; + + /* 0x2: drain pending events */ + while (!list_empty(&ev->events)) { + event = list_first_entry(&ev->events, struct gf_event, list); + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, "seq=%lu", + event->seq, "payload=%d", event->count, NULL); + + GF_FREE(event); + drained++; + } + + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, "num=%lu", drained, NULL); + + /* 0x3: freeup brick entry */ + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, "entry=%p", entry, NULL); + LOCK_DESTROY(&entry->statelock); + GF_FREE(entry); + } + + return NULL; +} + +int 
+gf_changelog_reborp_rpcsvc_notify(rpcsvc_t *rpc, void *mydata, + rpcsvc_event_t event, void *data) +{ + int ret = 0; + xlator_t *this = NULL; + gf_changelog_t *entry = NULL; + + if (!(event == RPCSVC_EVENT_ACCEPT || event == RPCSVC_EVENT_DISCONNECT)) + return 0; + + entry = mydata; + this = entry->this; + + switch (event) { + case RPCSVC_EVENT_ACCEPT: + ret = sys_unlink(RPC_SOCK(entry)); + if (ret != 0) + gf_smsg(this->name, GF_LOG_WARNING, errno, + CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=reverse socket", + "path=%s", RPC_SOCK(entry), NULL); + if (entry->connected) + GF_CHANGELOG_INVOKE_CBK(this, entry->connected, entry->brick, + entry->ptr); + break; + case RPCSVC_EVENT_DISCONNECT: + if (entry->disconnected) + GF_CHANGELOG_INVOKE_CBK(this, entry->disconnected, entry->brick, + entry->ptr); + /* passthrough */ + default: + break; + } + + return 0; +} + +rpcsvc_t * +gf_changelog_reborp_init_rpc_listner(xlator_t *this, char *path, char *sock, + void *cbkdata) +{ + CHANGELOG_MAKE_TMP_SOCKET_PATH(path, sock, UNIX_PATH_MAX); + return changelog_rpc_server_init(this, sock, cbkdata, + gf_changelog_reborp_rpcsvc_notify, + gf_changelog_reborp_programs); +} + +/** + * This is dirty and painful as of now until there is event filtering in the + * server. The entire event buffer is scanned and interested events are picked, + * whereas we _should_ be notified with the events we were interested in + * (selected at the time of probe). As of now this is complete BS and needs + * fixture ASAP. I just made it work, it needs to be better. + * + * @FIXME: cleanup this bugger once server filters events. 
+ */ +void +gf_changelog_invoke_callback(gf_changelog_t *entry, struct iovec **vec, + int payloadcnt) +{ + int i = 0; + int evsize = 0; + xlator_t *this = NULL; + changelog_event_t *event = NULL; + + this = entry->this; + + for (; i < payloadcnt; i++) { + event = (changelog_event_t *)vec[i]->iov_base; + evsize = vec[i]->iov_len / CHANGELOG_EV_SIZE; + + for (; evsize > 0; evsize--, event++) { + if (gf_changelog_filter_check(entry, event)) { + GF_CHANGELOG_INVOKE_CBK(this, entry->callback, entry->brick, + entry->ptr, event); + } + } + } +} + +/** + * Ordered event handler is self-adaptive.. if the event sequence number + * is what's expected (->next_seq) there is no ordering list that's + * maintained. On out-of-order event notifications, event buffers are + * dynamically allocated and ordered. + */ + +int +__is_expected_sequence(struct gf_event_list *ev, struct gf_event *event) +{ + return (ev->next_seq == event->seq); +} + +int +__can_process_event(struct gf_event_list *ev, struct gf_event **event) +{ + *event = list_first_entry(&ev->events, struct gf_event, list); + + if (__is_expected_sequence(ev, *event)) { + list_del(&(*event)->list); + ev->next_seq++; + return 1; + } + + return 0; +} + +void +pick_event_ordered(struct gf_event_list *ev, struct gf_event **event) +{ + pthread_mutex_lock(&ev->lock); + { + while (list_empty(&ev->events) || !__can_process_event(ev, event)) + pthread_cond_wait(&ev->cond, &ev->lock); + } + pthread_mutex_unlock(&ev->lock); +} + +void +pick_event_unordered(struct gf_event_list *ev, struct gf_event **event) +{ + pthread_mutex_lock(&ev->lock); + { + while (list_empty(&ev->events)) + pthread_cond_wait(&ev->cond, &ev->lock); + *event = list_first_entry(&ev->events, struct gf_event, list); + list_del(&(*event)->list); + } + pthread_mutex_unlock(&ev->lock); +} + +void * +gf_changelog_callback_invoker(void *arg) +{ + xlator_t *this = NULL; + gf_changelog_t *entry = NULL; + struct iovec *vec = NULL; + struct gf_event *event = NULL; + struct 
/**
 * Handle one incoming changelog event RPC for @entry.
 *
 * The request header is XDR-decoded from req->msg[0]; whatever follows
 * the header in msg[0], plus every further msg[] vector, is deep-copied
 * into a freshly allocated struct gf_event which is then handed to
 * entry->queueevent(). The reply acknowledges the request's sequence
 * number (op_ret 0) or reports failure (op_ret -1, seq 0).
 *
 * Returns the result of changelog_rpc_sumbit_reply().
 */
int
gf_changelog_event_handler(rpcsvc_request_t *req, xlator_t *this,
                           gf_changelog_t *entry)
{
    int i = 0;
    size_t payloadlen = 0;
    ssize_t len = 0;
    int payloadcnt = 0;
    changelog_event_req rpc_req = {
        0,
    };
    changelog_event_rsp rpc_rsp = {
        0,
    };
    struct iovec *vec = NULL;
    struct gf_event *event = NULL;
    struct gf_event_list *ev = NULL;

    ev = &entry->event;

    len = xdr_to_generic(req->msg[0], &rpc_req,
                         (xdrproc_t)xdr_changelog_event_req);
    if (len < 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0,
               CHANGELOG_LIB_MSG_XDR_DECODING_FAILED, "xdr decoding failed");
        req->rpc_err = GARBAGE_ARGS;
        goto handle_xdr_error;
    }

    /* bytes left in msg[0] after the decoded header count as the first
     * payload vector */
    if (len < req->msg[0].iov_len) {
        payloadcnt = 1;
        payloadlen = (req->msg[0].iov_len - len);
    }
    /* every additional request vector is payload in its entirety */
    for (i = 1; i < req->count; i++) {
        payloadcnt++;
        payloadlen += req->msg[i].iov_len;
    }

    event = GF_CALLOC(1, GF_EVENT_CALLOC_SIZE(payloadcnt, payloadlen),
                      gf_changelog_mt_libgfchangelog_event_t);
    /* allocation failure shares the error reply path (op_ret -1) */
    if (!event)
        goto handle_xdr_error;
    INIT_LIST_HEAD(&event->list);

    /* reset: from here on payloadlen is handed to GF_EVENT_ASSIGN_IOVEC
     * for each vector (presumably as a running offset; the macro is
     * defined elsewhere) */
    payloadlen = 0;
    event->seq = rpc_req.seq;
    event->count = payloadcnt;

    /* deep copy IO vectors */
    /* NOTE(review): iov[0] is filled in unconditionally, even when
     * msg[0] carried no bytes past the header and was therefore not
     * counted in payloadcnt -- confirm the allocation has room for it */
    vec = &event->iov[0];
    GF_EVENT_ASSIGN_IOVEC(vec, event, (req->msg[0].iov_len - len), payloadlen);
    (void)memcpy(vec->iov_base, req->msg[0].iov_base + len, vec->iov_len);

    for (i = 1; i < req->count; i++) {
        vec = &event->iov[i];
        GF_EVENT_ASSIGN_IOVEC(vec, event, req->msg[i].iov_len, payloadlen);
        (void)memcpy(event->iov[i].iov_base, req->msg[i].iov_base,
                     req->msg[i].iov_len);
    }

    gf_msg_debug(this->name, 0,
                 "seq: %" PRIu64 " [%s] (time: %" PRIu64 ".%" PRIu64
                 "), "
                 "(vec: %d, len: %zd)",
                 rpc_req.seq, entry->brick, rpc_req.tv_sec, rpc_req.tv_usec,
                 payloadcnt, payloadlen);

    /* dispatch event */
    entry->queueevent(ev, event);

    /* ack sequence number */
    rpc_rsp.op_ret = 0;
    rpc_rsp.seq = rpc_req.seq;

    goto submit_rpc;

handle_xdr_error:
    rpc_rsp.op_ret = -1;
    rpc_rsp.seq = 0; /* invalid */
submit_rpc:
    return changelog_rpc_sumbit_reply(req, &rpc_rsp, NULL, 0, NULL,
                                      (xdrproc_t)xdr_changelog_event_rsp);
}
->mydata as THIS. + * In gf_changelog_setup_rpc(), @cbkdata is of type @gf_changelog_t, + * and that's required to invoke the callback with the appropriate + * brick path and it's private data. + */ +static struct rpcsvc_program gf_changelog_reborp_prog = { + .progname = "LIBGFCHANGELOG REBORP", + .prognum = CHANGELOG_REV_RPC_PROCNUM, + .progver = CHANGELOG_REV_RPC_PROCVER, + .numactors = CHANGELOG_REV_PROC_MAX, + .actors = gf_changelog_reborp_actors, + .synctask = _gf_false, +}; + +static struct rpcsvc_program *gf_changelog_reborp_programs[] = { + &gf_changelog_reborp_prog, + NULL, +}; diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c new file mode 100644 index 00000000000..8ec6ffbcebc --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -0,0 +1,98 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "gf-changelog-rpc.h" +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +struct rpc_clnt_program gf_changelog_clnt; + +/* TODO: piggyback reconnect to called (upcall) */ +int +gf_changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, + rpc_clnt_event_t event, void *data) +{ + switch (event) { + case RPC_CLNT_CONNECT: + break; + case RPC_CLNT_DISCONNECT: + case RPC_CLNT_MSG: + case RPC_CLNT_DESTROY: + case RPC_CLNT_PING: + break; + } + + return 0; +} + +struct rpc_clnt * +gf_changelog_rpc_init(xlator_t *this, gf_changelog_t *entry) +{ + char sockfile[UNIX_PATH_MAX] = { + 0, + }; + + CHANGELOG_MAKE_SOCKET_PATH(entry->brick, sockfile, UNIX_PATH_MAX); + return changelog_rpc_client_init(this, entry, sockfile, + gf_changelog_rpc_notify); +} + +/** + * remote procedure calls declarations. + */ + +int +gf_probe_changelog_cbk(struct rpc_req *req, struct iovec *iovec, int count, + void *myframe) +{ + return 0; +} + +int +gf_probe_changelog_filter(call_frame_t *frame, xlator_t *this, void *data) +{ + char *sock = NULL; + gf_changelog_t *entry = NULL; + changelog_probe_req req = { + 0, + }; + + entry = data; + sock = RPC_SOCK(entry); + + (void)memcpy(&req.sock, sock, strlen(sock)); + req.filter = entry->notify; + + /* invoke RPC */ + return changelog_rpc_sumbit_req( + RPC_PROBER(entry), (void *)&req, frame, &gf_changelog_clnt, + CHANGELOG_RPC_PROBE_FILTER, NULL, 0, NULL, this, gf_probe_changelog_cbk, + (xdrproc_t)xdr_changelog_probe_req); +} + +int +gf_changelog_invoke_rpc(xlator_t *this, gf_changelog_t *entry, int procidx) +{ + return changelog_invoke_rpc(this, RPC_PROBER(entry), &gf_changelog_clnt, + procidx, entry); +} + +struct rpc_clnt_procedure gf_changelog_procs[CHANGELOG_RPC_PROC_MAX] = { + [CHANGELOG_RPC_PROC_NULL] = {"NULL", NULL}, + [CHANGELOG_RPC_PROBE_FILTER] = {"PROBE FILTER", gf_probe_changelog_filter}, +}; + +struct rpc_clnt_program gf_changelog_clnt = { + .progname = "LIBGFCHANGELOG", + .prognum = CHANGELOG_RPC_PROGNUM, + 
.progver = CHANGELOG_RPC_PROGVER, + .numproc = CHANGELOG_RPC_PROC_MAX, + .proctable = gf_changelog_procs, +}; diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/gf-changelog-rpc.h index 5cddfb5839c..5c82d6f1c08 100644 --- a/xlators/features/changelog/lib/src/changelog.h +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.h @@ -8,24 +8,21 @@ cases as published by the Free Software Foundation. */ -#ifndef _GF_CHANGELOG_H -#define _GF_CHANGELOG_H +#ifndef __GF_CHANGELOG_RPC_H +#define __GF_CHANGELOG_RPC_H -/* API set */ +#include <glusterfs/xlator.h> -int -gf_changelog_register (char *brick_path, char *scratch_dir, - char *log_file, int log_levl, int max_reconnects); -ssize_t -gf_changelog_scan (); - -int -gf_changelog_start_fresh (); +#include "gf-changelog-helpers.h" +#include "changelog-rpc-common.h" -ssize_t -gf_changelog_next_change (char *bufptr, size_t maxlen); +struct rpc_clnt * +gf_changelog_rpc_init(xlator_t *, gf_changelog_t *); int -gf_changelog_done (char *file); +gf_changelog_invoke_rpc(xlator_t *, gf_changelog_t *, int); + +rpcsvc_t * +gf_changelog_reborp_init_rpc_listner(xlator_t *, char *, char *, void *); #endif diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c index ca8e373e700..57c3d39ef76 100644 --- a/xlators/features/changelog/lib/src/gf-changelog.c +++ b/xlators/features/changelog/lib/src/gf-changelog.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -14,502 +14,639 @@ #include <sys/types.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/time.h> +#include <sys/resource.h> #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include <string.h> -#include "globals.h" -#include "glusterfs.h" -#include "logging.h" +#include <glusterfs/globals.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> +#include "gf-changelog-rpc.h" #include "gf-changelog-helpers.h" /* from the changelog translator */ #include "changelog-misc.h" #include "changelog-mem-types.h" +#include "changelog-lib-messages.h" -int byebye = 0; +/** + * Global singleton xlator pointer for the library, initialized + * during library load. This should probably be hidden inside + * an initialized object which is an handle for the consumer. + * + * TODO: do away with the global.. + */ +xlator_t *master = NULL; -static void -gf_changelog_cleanup (gf_changelog_t *gfc) +static inline gf_private_t * +gf_changelog_alloc_priv() { - /* socket */ - if (gfc->gfc_sockfd != -1) - close (gfc->gfc_sockfd); - /* tracker fd */ - if (gfc->gfc_fd != -1) - close (gfc->gfc_fd); - /* processing dir */ - if (gfc->gfc_dir) - closedir (gfc->gfc_dir); - - if (gfc->gfc_working_dir) - free (gfc->gfc_working_dir); /* allocated by realpath */ + int ret = 0; + gf_private_t *priv = NULL; + + priv = GF_CALLOC(1, sizeof(*priv), gf_changelog_mt_priv_t); + if (!priv) + goto error_return; + INIT_LIST_HEAD(&priv->connections); + INIT_LIST_HEAD(&priv->cleanups); + + ret = pthread_mutex_init(&priv->lock, NULL); + if (ret != 0) + goto free_priv; + ret = pthread_cond_init(&priv->cond, NULL); + if (ret != 0) + goto cleanup_mutex; + + priv->api = NULL; + return priv; + +cleanup_mutex: + (void)pthread_mutex_destroy(&priv->lock); +free_priv: + GF_FREE(priv); +error_return: + return NULL; } -void -__attribute__ ((constructor)) gf_changelog_ctor 
(void) +#define GF_CHANGELOG_EVENT_POOL_SIZE 16384 +#define GF_CHANGELOG_EVENT_THREAD_COUNT 4 + +static int +gf_changelog_ctx_defaults_init(glusterfs_ctx_t *ctx) { - glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + struct rlimit lim = { + 0, + }; + call_pool_t *pool = NULL; + int ret = -1; + + ret = xlator_mem_acct_init(THIS, gf_changelog_mt_end); + if (ret != 0) + return -1; - ctx = glusterfs_ctx_new (); - if (!ctx) - return; + ctx->process_uuid = generate_glusterfs_ctx_id(); + if (!ctx->process_uuid) + return -1; - if (glusterfs_globals_init (ctx)) { - free (ctx); - ctx = NULL; - return; - } + ctx->page_size = 128 * GF_UNIT_KB; - THIS->ctx = ctx; -} + ctx->iobuf_pool = iobuf_pool_new(); + if (!ctx->iobuf_pool) + goto free_pool; -void -__attribute__ ((destructor)) gf_changelog_dtor (void) -{ - xlator_t *this = NULL; - glusterfs_ctx_t *ctx = NULL; - gf_changelog_t *gfc = NULL; + ctx->event_pool = gf_event_pool_new(GF_CHANGELOG_EVENT_POOL_SIZE, + GF_CHANGELOG_EVENT_THREAD_COUNT); + if (!ctx->event_pool) + goto free_pool; - this = THIS; - if (!this) - return; + pool = GF_CALLOC(1, sizeof(call_pool_t), + gf_changelog_mt_libgfchangelog_call_pool_t); + if (!pool) + goto free_pool; - ctx = this->ctx; - gfc = this->private; + /* frame_mem_pool size 112 * 64 */ + pool->frame_mem_pool = mem_pool_new(call_frame_t, 32); + if (!pool->frame_mem_pool) + goto free_pool; - if (gfc) { - gf_changelog_cleanup (gfc); - GF_FREE (gfc); - } + /* stack_mem_pool size 256 * 128 */ + pool->stack_mem_pool = mem_pool_new(call_stack_t, 16); - if (ctx) { - pthread_mutex_destroy (&ctx->lock); - free (ctx); - ctx = NULL; - } -} + if (!pool->stack_mem_pool) + goto free_pool; + ctx->stub_mem_pool = mem_pool_new(call_stub_t, 16); + if (!ctx->stub_mem_pool) + goto free_pool; -static int -gf_changelog_open_dirs (gf_changelog_t *gfc) -{ - int ret = -1; - DIR *dir = NULL; - int tracker_fd = 0; - char tracker_path[PATH_MAX] = {0,}; - - (void) snprintf (gfc->gfc_current_dir, PATH_MAX, - 
"%s/"GF_CHANGELOG_CURRENT_DIR"/", - gfc->gfc_working_dir); - ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false); - if (ret) - goto out; + ctx->dict_pool = mem_pool_new(dict_t, 32); + if (!ctx->dict_pool) + goto free_pool; - (void) snprintf (gfc->gfc_processed_dir, PATH_MAX, - "%s/"GF_CHANGELOG_PROCESSED_DIR"/", - gfc->gfc_working_dir); - ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false); - if (ret) - goto out; + ctx->dict_pair_pool = mem_pool_new(data_pair_t, 512); + if (!ctx->dict_pair_pool) + goto free_pool; - (void) snprintf (gfc->gfc_processing_dir, PATH_MAX, - "%s/"GF_CHANGELOG_PROCESSING_DIR"/", - gfc->gfc_working_dir); - ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false); - if (ret) - goto out; + ctx->dict_data_pool = mem_pool_new(data_t, 512); + if (!ctx->dict_data_pool) + goto free_pool; - dir = opendir (gfc->gfc_processing_dir); - if (!dir) { - gf_log ("", GF_LOG_ERROR, - "opendir() error [reason: %s]", strerror (errno)); - goto out; - } + ctx->logbuf_pool = mem_pool_new(log_buf_t, 256); + if (!ctx->logbuf_pool) + goto free_pool; - gfc->gfc_dir = dir; + INIT_LIST_HEAD(&pool->all_frames); + LOCK_INIT(&pool->lock); + ctx->pool = pool; - (void) snprintf (tracker_path, PATH_MAX, - "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir); + LOCK_INIT(&ctx->lock); - tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (tracker_fd < 0) { - closedir (gfc->gfc_dir); - ret = -1; - goto out; - } + cmd_args = &ctx->cmd_args; - gfc->gfc_fd = tracker_fd; - ret = 0; - out: - return ret; -} + INIT_LIST_HEAD(&cmd_args->xlator_options); -int -gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc) -{ - int ret = 0; - int len = 0; - int tries = 0; - int sockfd = 0; - struct sockaddr_un remote; - - this = gfc->this; - - if (gfc->gfc_sockfd != -1) { - gf_log (this->name, GF_LOG_INFO, - "Reconnecting..."); - close (gfc->gfc_sockfd); - } + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + 
setrlimit(RLIMIT_CORE, &lim); - sockfd = socket (AF_UNIX, SOCK_STREAM, 0); - if (sockfd < 0) { - ret = -1; - goto out; - } + return 0; - CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath, - gfc->gfc_sockpath, PATH_MAX); - gf_log (this->name, GF_LOG_INFO, - "connecting to changelog socket: %s (brick: %s)", - gfc->gfc_sockpath, gfc->gfc_brickpath); +free_pool: + if (pool) { + GF_FREE(pool->frame_mem_pool); - remote.sun_family = AF_UNIX; - strcpy (remote.sun_path, gfc->gfc_sockpath); + GF_FREE(pool->stack_mem_pool); - len = strlen (remote.sun_path) + sizeof (remote.sun_family); + GF_FREE(pool); + } - while (tries < gfc->gfc_connretries) { - gf_log (this->name, GF_LOG_WARNING, - "connection attempt %d/%d...", - tries + 1, gfc->gfc_connretries); + GF_FREE(ctx->stub_mem_pool); - /* initiate a connect */ - if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) { - gfc->gfc_sockfd = sockfd; - break; - } + GF_FREE(ctx->dict_pool); - tries++; - sleep (2); - } + GF_FREE(ctx->dict_pair_pool); - if (tries == gfc->gfc_connretries) { - gf_log (this->name, GF_LOG_ERROR, - "could not connect to changelog socket!" 
- " bailing out..."); - ret = -1; - } else - gf_log (this->name, GF_LOG_INFO, - "connection successful"); - - out: - return ret; + GF_FREE(ctx->dict_data_pool); + + GF_FREE(ctx->logbuf_pool); + + GF_FREE(ctx->iobuf_pool); + + GF_FREE(ctx->event_pool); + + return -1; } -int -gf_changelog_done (char *file) +/* TODO: cleanup ctx defaults */ +void +gf_changelog_cleanup_this(xlator_t *this) { - int ret = -1; - char *buffer = NULL; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - char to_path[PATH_MAX] = {0,}; + glusterfs_ctx_t *ctx = NULL; - errno = EINVAL; + if (!this) + return; - this = THIS; - if (!this) - goto out; - - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; - - if (!file || !strlen (file)) - goto out; - - /* make sure 'file' is inside ->gfc_working_dir */ - buffer = realpath (file, NULL); - if (!buffer) - goto out; - - if (strncmp (gfc->gfc_working_dir, - buffer, strlen (gfc->gfc_working_dir))) - goto out; - - (void) snprintf (to_path, PATH_MAX, "%s%s", - gfc->gfc_processed_dir, basename (buffer)); - gf_log (this->name, GF_LOG_DEBUG, - "moving %s to processed directory", file); - ret = rename (buffer, to_path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "cannot move %s to %s (reason: %s)", - file, to_path, strerror (errno)); - goto out; - } + ctx = this->ctx; + syncenv_destroy(ctx->env); + free(ctx); - ret = 0; + this->private = NULL; + this->ctx = NULL; - out: - if (buffer) - free (buffer); /* allocated by realpath() */ - return ret; + mem_pools_fini(); } -/** - * @API - * for a set of changelogs, start from the begining - */ -int -gf_changelog_start_fresh () +static int +gf_changelog_init_context() { - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - - this = THIS; - if (!this) - goto out; + glusterfs_ctx_t *ctx = NULL; - errno = EINVAL; + ctx = glusterfs_ctx_new(); + if (!ctx) + goto error_return; - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; + if (glusterfs_globals_init(ctx)) + goto free_ctx; - 
if (gf_ftruncate (gfc->gfc_fd, 0)) - goto out; + THIS->ctx = ctx; + if (gf_changelog_ctx_defaults_init(ctx)) + goto free_ctx; - return 0; + ctx->env = syncenv_new(0, 0, 0); + if (!ctx->env) + goto free_ctx; + return 0; - out: - return -1; +free_ctx: + free(ctx); + THIS->ctx = NULL; +error_return: + return -1; } -/** - * @API - * return the next changelog file entry. zero means all chanelogs - * consumed. - */ -ssize_t -gf_changelog_next_change (char *bufptr, size_t maxlen) +static int +gf_changelog_init_master() { - ssize_t size = 0; - int tracker_fd = 0; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - char buffer[PATH_MAX] = {0,}; + int ret = 0; - errno = EINVAL; + ret = gf_changelog_init_context(); + mem_pools_init(); - this = THIS; - if (!this) - goto out; - - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; + return ret; +} - tracker_fd = gfc->gfc_fd; +/* TODO: cleanup clnt/svc on failure */ +int +gf_changelog_setup_rpc(xlator_t *this, gf_changelog_t *entry, int proc) +{ + int ret = 0; + rpcsvc_t *svc = NULL; + struct rpc_clnt *rpc = NULL; + + /** + * Initialize a connect back socket. A probe() RPC call to the server + * triggers a reverse connect. + */ + svc = gf_changelog_reborp_init_rpc_listner(this, entry->brick, + RPC_SOCK(entry), entry); + if (!svc) + goto error_return; + RPC_REBORP(entry) = svc; + + /* Initialize an RPC client */ + rpc = gf_changelog_rpc_init(this, entry); + if (!rpc) + goto error_return; + RPC_PROBER(entry) = rpc; + + /** + * @FIXME + * till we have connection state machine, let's delay the RPC call + * for now.. + */ + sleep(2); + + /** + * Probe changelog translator for reverse connection. After a successful + * call, there's less use of the client and can be disconnected, but + * let's leave the connection active for any future RPC calls. 
+ */ + ret = gf_changelog_invoke_rpc(this, entry, proc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_INVOKE_RPC_FAILED, + "Could not initiate probe RPC, bailing out!!!"); + goto error_return; + } + + return 0; + +error_return: + return -1; +} - size = gf_readline (tracker_fd, buffer, maxlen); - if (size < 0) - goto out; - if (size == 0) - return 0; +int +gf_cleanup_event(xlator_t *this, struct gf_event_list *ev) +{ + int ret = 0; + + ret = gf_thread_cleanup(this, ev->invoker); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + CHANGELOG_LIB_MSG_CLEANUP_ERROR, + "cannot cleanup callback invoker thread." + " Not freeing resources"); + return -1; + } - memcpy (bufptr, buffer, size - 1); - *(buffer + size) = '\0'; + ev->entry = NULL; - return size; + return 0; +} - out: - return -1; +static int +gf_init_event(gf_changelog_t *entry) +{ + int ret = 0; + struct gf_event_list *ev = NULL; + + ev = &entry->event; + ev->entry = entry; + + ret = pthread_mutex_init(&ev->lock, NULL); + if (ret != 0) + goto error_return; + ret = pthread_cond_init(&ev->cond, NULL); + if (ret != 0) + goto cleanup_mutex; + INIT_LIST_HEAD(&ev->events); + + ev->next_seq = 0; /* bootstrap sequencing */ + + if (GF_NEED_ORDERED_EVENTS(entry)) { + entry->pickevent = pick_event_ordered; + entry->queueevent = queue_ordered_event; + } else { + entry->pickevent = pick_event_unordered; + entry->queueevent = queue_unordered_event; + } + + ret = gf_thread_create(&ev->invoker, NULL, gf_changelog_callback_invoker, + ev, "clogcbki"); + if (ret != 0) { + entry->pickevent = NULL; + entry->queueevent = NULL; + goto cleanup_cond; + } + + return 0; + +cleanup_cond: + (void)pthread_cond_destroy(&ev->cond); +cleanup_mutex: + (void)pthread_mutex_destroy(&ev->lock); +error_return: + return -1; } /** - * @API - * gf_changelog_scan() - scan and generate a list of change entries - * - * calling this api multiple times (without calling gf_changlog_done()) - * would result new changelogs(s) being 
refreshed in the tracker file. - * This call also acts as a cancellation point for the consumer. + * TODO: + * - cleanup invoker thread + * - cleanup event list + * - destroy rpc{-clnt, svc} */ -ssize_t -gf_changelog_scan () +int +gf_cleanup_brick_connection(xlator_t *this, gf_changelog_t *entry) { - int ret = 0; - int tracker_fd = 0; - size_t len = 0; - size_t off = 0; - xlator_t *this = NULL; - size_t nr_entries = 0; - gf_changelog_t *gfc = NULL; - struct dirent *entryp = NULL; - struct dirent *result = NULL; - char buffer[PATH_MAX] = {0,}; + return 0; +} - this = THIS; - if (!this) - goto out; - - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; - - /** - * do we need to protect 'byebye' with locks? worst, the - * consumer would get notified during next scan(). - */ - if (byebye) { - errno = ECONNREFUSED; - goto out; - } +int +gf_cleanup_connections(xlator_t *this) +{ + return 0; +} - errno = EINVAL; - - tracker_fd = gfc->gfc_fd; - - if (gf_ftruncate (tracker_fd, 0)) - goto out; - - len = offsetof(struct dirent, d_name) - + pathconf(gfc->gfc_processing_dir, _PC_NAME_MAX) + 1; - entryp = GF_CALLOC (1, len, - gf_changelog_mt_libgfchangelog_dirent_t); - if (!entryp) - goto out; - - rewinddir (gfc->gfc_dir); - while (1) { - ret = readdir_r (gfc->gfc_dir, entryp, &result); - if (ret || !result) - break; - - if ( !strcmp (basename (entryp->d_name), ".") - || !strcmp (basename (entryp->d_name), "..") ) - continue; - - nr_entries++; - - GF_CHANGELOG_FILL_BUFFER (gfc->gfc_processing_dir, - buffer, off, - strlen (gfc->gfc_processing_dir)); - GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, - off, strlen (entryp->d_name)); - GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); - - if (gf_changelog_write (tracker_fd, buffer, off) != off) { - gf_log (this->name, GF_LOG_ERROR, - "error writing changelog filename" - " to tracker file"); - break; - } - off = 0; - } +static int +gf_setup_brick_connection(xlator_t *this, struct gf_brick_spec *brick, + gf_boolean_t 
ordered, void *xl) +{ + int ret = 0; + gf_private_t *priv = NULL; + gf_changelog_t *entry = NULL; + + priv = this->private; + + if (!brick->callback || !brick->init || !brick->fini) + goto error_return; + + entry = GF_CALLOC(1, sizeof(*entry), gf_changelog_mt_libgfchangelog_t); + if (!entry) + goto error_return; + INIT_LIST_HEAD(&entry->list); + + LOCK_INIT(&entry->statelock); + entry->connstate = GF_CHANGELOG_CONN_STATE_PENDING; + + entry->notify = brick->filter; + if (snprintf(entry->brick, PATH_MAX, "%s", brick->brick_path) >= PATH_MAX) + goto free_entry; + + entry->this = this; + entry->invokerxl = xl; + + entry->ordered = ordered; + ret = gf_init_event(entry); + if (ret) + goto free_entry; + + entry->fini = brick->fini; + entry->callback = brick->callback; + entry->connected = brick->connected; + entry->disconnected = brick->disconnected; + + entry->ptr = brick->init(this, brick); + if (!entry->ptr) + goto cleanup_event; + priv->api = entry->ptr; /* pointer to API, if required */ + + pthread_mutex_lock(&priv->lock); + { + list_add_tail(&entry->list, &priv->connections); + } + pthread_mutex_unlock(&priv->lock); + + ret = gf_changelog_setup_rpc(this, entry, CHANGELOG_RPC_PROBE_FILTER); + if (ret) + goto cleanup_event; + return 0; + +cleanup_event: + (void)gf_cleanup_event(this, &entry->event); +free_entry: + gf_msg_debug(this->name, 0, "freeing entry %p", entry); + list_del(&entry->list); /* FIXME: kludge for now */ + GF_FREE(entry); +error_return: + return -1; +} - GF_FREE (entryp); +int +gf_changelog_register_brick(xlator_t *this, struct gf_brick_spec *brick, + gf_boolean_t ordered, void *xl) +{ + return gf_setup_brick_connection(this, brick, ordered, xl); +} - if (!result) { - if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) - return nr_entries; - } - out: +static int +gf_changelog_setup_logging(xlator_t *this, char *logfile, int loglevel) +{ + /* passing ident as NULL means to use default ident for syslog */ + if (gf_log_init(this->ctx, logfile, NULL)) return 
-1; + + gf_log_set_loglevel(this->ctx, (loglevel == -1) ? GF_LOG_INFO : loglevel); + return 0; } -/** - * @API - * gf_changelog_register() - register a client for updates. - */ -int -gf_changelog_register (char *brick_path, char *scratch_dir, - char *log_file, int log_level, int max_reconnects) +static int +gf_changelog_set_master(xlator_t *master, void *xl) { - int i = 0; - int ret = -1; - int errn = 0; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - + int32_t ret = 0; + xlator_t *this = NULL; + xlator_t *old_this = NULL; + gf_private_t *priv = NULL; + + this = xl; + if (!this || !this->ctx) { + ret = gf_changelog_init_master(); + if (ret) + return -1; this = THIS; - if (!this->ctx) - goto out; + } + + master->ctx = this->ctx; + + INIT_LIST_HEAD(&master->volume_options); + SAVE_THIS(THIS); + + ret = xlator_mem_acct_init(THIS, gf_changelog_mt_end); + if (ret != 0) + goto restore_this; + + priv = gf_changelog_alloc_priv(); + if (!priv) { + ret = -1; + goto restore_this; + } + + if (!xl) { + /* poller thread */ + ret = gf_thread_create(&priv->poller, NULL, changelog_rpc_poller, THIS, + "clogpoll"); + if (ret != 0) { + GF_FREE(priv); + gf_msg(master->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED, + "failed to spawn poller thread"); + goto restore_this; + } + } + + master->private = priv; - errno = ENOMEM; +restore_this: + RESTORE_THIS(); - gfc = GF_CALLOC (1, sizeof (*gfc), - gf_changelog_mt_libgfchangelog_t); - if (!gfc) - goto out; + return ret; +} - gfc->this = this; +int +gf_changelog_init(void *xl) +{ + int ret = 0; + gf_private_t *priv = NULL; - gfc->gfc_dir = NULL; - gfc->gfc_fd = gfc->gfc_sockfd = -1; + if (master) + return 0; - gfc->gfc_working_dir = realpath (scratch_dir, NULL); - if (!gfc->gfc_working_dir) { - errn = errno; - goto cleanup; - } + master = calloc(1, sizeof(*master)); + if (!master) + goto error_return; + + master->name = strdup("gfchangelog"); + if (!master->name) + goto dealloc_master; + + ret = 
gf_changelog_set_master(master, xl); + if (ret) + goto dealloc_name; + + priv = master->private; + ret = gf_thread_create(&priv->connectionjanitor, NULL, + gf_changelog_connection_janitor, master, "clogjan"); + if (ret != 0) { + /* TODO: cleanup priv, mutex (poller thread for !xl) */ + goto dealloc_name; + } + + return 0; + +dealloc_name: + free(master->name); +dealloc_master: + free(master); + master = NULL; +error_return: + return -1; +} - ret = gf_changelog_open_dirs (gfc); - if (ret) { - errn = errno; - gf_log (this->name, GF_LOG_ERROR, - "could not create entries in scratch dir"); - goto cleanup; +int +gf_changelog_register_generic(struct gf_brick_spec *bricks, int count, + int ordered, char *logfile, int lvl, void *xl) +{ + int ret = 0; + xlator_t *this = NULL; + xlator_t *old_this = NULL; + struct gf_brick_spec *brick = NULL; + gf_boolean_t need_order = _gf_false; + + SAVE_THIS(xl); + + this = THIS; + if (!this) + goto error_return; + + ret = gf_changelog_setup_logging(this, logfile, lvl); + if (ret) + goto error_return; + + need_order = (ordered) ? _gf_true : _gf_false; + + brick = bricks; + while (count--) { + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO, "brick=%s", + brick->brick_path, "notify_filter=%d", brick->filter, NULL); + + ret = gf_changelog_register_brick(this, brick, need_order, xl); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_NOTIFY_REGISTER_FAILED, + "Error registering with changelog xlator"); + break; } - /* passing ident as NULL means to use default ident for syslog */ - if (gf_log_init (this->ctx, log_file, NULL)) - goto cleanup; + brick++; + } - gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO : - log_level); + if (ret != 0) + goto cleanup_inited_bricks; - gfc->gfc_connretries = (max_reconnects <= 0) ? 
1 : max_reconnects; - (void) strncpy (gfc->gfc_brickpath, brick_path, PATH_MAX); + RESTORE_THIS(); + return 0; - ret = gf_changelog_notification_init (this, gfc); - if (ret) { - errn = errno; - goto cleanup; - } +cleanup_inited_bricks: + gf_cleanup_connections(this); +error_return: + RESTORE_THIS(); + return -1; +} - ret = gf_thread_create (&gfc->gfc_changelog_processor, - NULL, gf_changelog_process, gfc); - if (ret) { - errn = errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating changelog processor thread" - " new changes won't be recorded!!!"); - goto cleanup; - } +/** + * @API + * gf_changelog_register() + * + * This is _NOT_ a generic register API. It's a special API to handle + * updates at a journal granulality. This is used by consumers wanting + * to process persistent journal such as geo-replication via a set of + * APIs. All of this is required to maintain backward compatibility. + * Owner specific private data is stored in ->api (in gf_private_t), + * which is used by APIs to access it's private data. This limits + * the library access to a single brick, but that's how it used to + * be anyway. Furthermore, this API solely _owns_ "this", therefore + * callers already having a notion of "this" are expected to use the + * newer API. + * + * Newer applications wanting to use this library need not face this + * limitation and reply of the much more feature rich generic register + * API, which is purely callback based. + * + * NOTE: @max_reconnects is not used but required for backward compat. + * + * For generic API, refer gf_changelog_register_generic(). + */ +int +gf_changelog_register(char *brick_path, char *scratch_dir, char *log_file, + int log_level, int max_reconnects) +{ + struct gf_brick_spec brick = { + 0, + }; - for (; i < 256; i++) { - gfc->rfc3986[i] = - (isalnum(i) || i == '~' || - i == '-' || i == '.' || i == '_') ? 
i : 0; - } + if (master) + THIS = master; + else + return -1; - ret = 0; - this->private = gfc; + brick.brick_path = brick_path; + brick.filter = CHANGELOG_OP_TYPE_JOURNAL; - goto out; + brick.init = gf_changelog_journal_init; + brick.fini = gf_changelog_journal_fini; + brick.callback = gf_changelog_handle_journal; + brick.connected = gf_changelog_journal_connect; + brick.disconnected = gf_changelog_journal_disconnect; - cleanup: - gf_changelog_cleanup (gfc); - GF_FREE (gfc); - this->private = NULL; - errno = errn; + brick.ptr = scratch_dir; - out: - return ret; + return gf_changelog_register_generic(&brick, 1, 1, log_file, log_level, + NULL); } diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c new file mode 100644 index 00000000000..a16219f3664 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-history-changelog.c @@ -0,0 +1,1020 @@ +#include <errno.h> +#include <dirent.h> +#include <stddef.h> +#include <sys/types.h> + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <string.h> + +#include <glusterfs/globals.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/syscall.h> + +#include "gf-changelog-helpers.h" +#include "gf-changelog-journal.h" + +/* from the changelog translator */ +#include "changelog-misc.h" +#include "changelog-lib-messages.h" +#include "changelog-mem-types.h" + +/** + * @API + * gf_history_changelog_done: + * Move processed history changelog file from .processing + * to .processed + * + * ARGUMENTS: + * file(IN): path to processed history changelog file in + * .processing directory. + * + * RETURN VALUE: + * 0: On success. + * -1: On error. 
+ */ +int +gf_history_changelog_done(char *file) +{ + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + char to_path[PATH_MAX] = { + 0, + }; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) + goto out; + + if (!file || !strlen(file)) + goto out; + + /* make sure 'file' is inside ->jnl_working_dir */ + buffer = realpath(file, NULL); + if (!buffer) + goto out; + + if (strncmp(hist_jnl->jnl_working_dir, buffer, + strlen(hist_jnl->jnl_working_dir))) + goto out; + + (void)snprintf(to_path, PATH_MAX, "%s%s", hist_jnl->jnl_processed_dir, + basename(buffer)); + gf_msg_debug(this->name, 0, "moving %s to processed directory", file); + ret = sys_rename(buffer, to_path); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s", + to_path, NULL); + goto out; + } + + ret = 0; + +out: + if (buffer) + free(buffer); /* allocated by realpath() */ + return ret; +} + +/** + * @API + * gf_history_changelog_start_fresh: + * For a set of changelogs, start from the beginning. + * It will truncates the history tracker fd. + * + * RETURN VALUES: + * 0: On success. + * -1: On error. + */ +int +gf_history_changelog_start_fresh() +{ + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + + this = THIS; + if (!this) + goto out; + + errno = EINVAL; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) + goto out; + + if (gf_ftruncate(hist_jnl->jnl_fd, 0)) + goto out; + + return 0; + +out: + return -1; +} + +/** + * @API + * gf_history_changelog_next_change: + * Return the next history changelog file entry. Zero means all + * history chanelogs are consumed. 
+ * + * ARGUMENTS: + * bufptr(OUT): Path to unprocessed history changelog file + * from tracker file. + * maxlen(IN): Usually PATH_MAX. + * + * RETURN VALUES: + * size: On success. + * -1 : On error. + */ +ssize_t +gf_history_changelog_next_change(char *bufptr, size_t maxlen) +{ + ssize_t size = -1; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + char buffer[PATH_MAX] = { + 0, + }; + + if (maxlen > PATH_MAX) { + errno = ENAMETOOLONG; + goto out; + } + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) + goto out; + + tracker_fd = hist_jnl->jnl_fd; + + size = gf_readline(tracker_fd, buffer, maxlen); + if (size < 0) { + size = -1; + goto out; + } + + if (size == 0) + goto out; + + memcpy(bufptr, buffer, size - 1); + bufptr[size - 1] = '\0'; + +out: + return size; +} + +/** + * @API + * gf_history_changelog_scan: + * Scan and generate a list of change entries. + * Calling this api multiple times (without calling gf_changlog_done()) + * would result new changelogs(s) being refreshed in the tracker file. + * This call also acts as a cancellation point for the consumer. + * + * RETURN VALUES: + * +ve integer : success and keep scanning.(count of changelogs) + * 0 : success and done scanning. + * -1 : error. 
+ * + * NOTE: After first 0 return call_get_next change for once more time + * to empty the tracker + * + */ +ssize_t +gf_history_changelog_scan() +{ + int tracker_fd = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + char buffer[PATH_MAX] = { + 0, + }; + static int is_last_scan; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) + goto out; + if (JNL_IS_API_DISCONNECTED(jnl)) { + errno = ENOTCONN; + goto out; + } + + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) + goto out; + +retry: + if (is_last_scan == 1) + return 0; + if (hist_jnl->hist_done == 0) + is_last_scan = 1; + + errno = EINVAL; + if (hist_jnl->hist_done == -1) + goto out; + + tracker_fd = hist_jnl->jnl_fd; + + if (gf_ftruncate(tracker_fd, 0)) + goto out; + + rewinddir(hist_jnl->jnl_dir); + + for (;;) { + errno = 0; + entry = sys_readdir(hist_jnl->jnl_dir, scratch); + if (!entry || errno != 0) + break; + + if (strcmp(basename(entry->d_name), ".") == 0 || + strcmp(basename(entry->d_name), "..") == 0) + continue; + + nr_entries++; + + GF_CHANGELOG_FILL_BUFFER(hist_jnl->jnl_processing_dir, buffer, off, + strlen(hist_jnl->jnl_processing_dir)); + GF_CHANGELOG_FILL_BUFFER(entry->d_name, buffer, off, + strlen(entry->d_name)); + GF_CHANGELOG_FILL_BUFFER("\n", buffer, off, 1); + + if (gf_changelog_write(tracker_fd, buffer, off) != off) { + gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_WRITE_FAILED, + "error writing changelog filename" + " to tracker file"); + break; + } + off = 0; + } + + gf_msg_debug(this->name, 0, "hist_done %d, is_last_scan: %d", + hist_jnl->hist_done, is_last_scan); + + if (!entry) { + if (gf_lseek(tracker_fd, 0, SEEK_SET) != -1) { + if (nr_entries > 0) + return nr_entries; + else { + sleep(1); + goto retry; + } + } + } +out: + 
return -1; +} + +/* + * Gets timestamp value at the changelog path at index. + * Returns 0 on success(updates given time-stamp), -1 on failure. + */ +int +gf_history_get_timestamp(int fd, int index, int len, unsigned long *ts) +{ + xlator_t *this = NULL; + int n_read = -1; + char path_buf[PATH_MAX] = { + 0, + }; + char *iter = path_buf; + size_t offset = index * (len + 1); + unsigned long value = 0; + int ret = 0; + + this = THIS; + if (!this) { + return -1; + } + + n_read = sys_pread(fd, path_buf, len, offset); + if (n_read < 0) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_READ_ERROR, + "could not read from htime file"); + goto out; + } + iter += len - TIMESTAMP_LENGTH; + sscanf(iter, "%lu", &value); +out: + if (ret == 0) + *ts = value; + return ret; +} + +/* + * Function to ensure correctness of search + * Checks whether @value is there next to @target_index or not + */ +int +gf_history_check(int fd, int target_index, unsigned long value, int len) +{ + int ret = 0; + unsigned long ts1 = 0; + unsigned long ts2 = 0; + + if (target_index == 0) { + ret = gf_history_get_timestamp(fd, target_index, len, &ts1); + if (ret == -1) + goto out; + if (value <= ts1) + goto out; + else { + ret = -1; + goto out; + } + } + + ret = gf_history_get_timestamp(fd, target_index, len, &ts1); + if (ret == -1) + goto out; + ret = gf_history_get_timestamp(fd, target_index - 1, len, &ts2); + if (ret == -1) + goto out; + + if ((value <= ts1) && (value > ts2)) { + goto out; + } else + ret = -1; +out: + return ret; +} + +/* + * This is a "binary search" based search function which checks neighbours + * for in-range availability of the value to be searched and provides the + * index at which the changelog file nearest to the requested timestamp(value) + * can be read from. + * + * Actual offset can be calculated as (index* (len+1) ). + * "1" is because the changelog paths are null terminated. 
+ * + * @path : Htime file to search in + * @value : time stamp to search + * @from : start index to search + * @to : end index to search + * @len : length of fixes length strings separated by null + */ + +int +gf_history_b_search(int fd, unsigned long value, unsigned long from, + unsigned long to, int len) +{ + int m_index = -1; + unsigned long cur_value = 0; + unsigned long ts1 = 0; + int ret = 0; + + m_index = (from + to) / 2; + + if ((to - from) <= 1) { + /* either one or 2 changelogs left */ + if (to != from) { + /* check if value is less or greater than to + * return accordingly + */ + ret = gf_history_get_timestamp(fd, from, len, &ts1); + if (ret == -1) + goto out; + if (ts1 >= value) { + /* actually compatision should be + * exactly == but considering + * + * case of only 2 changelogs in htime file + */ + return from; + } else + return to; + } else + return to; + } + + ret = gf_history_get_timestamp(fd, m_index, len, &cur_value); + if (ret == -1) + goto out; + if (cur_value == value) { + return m_index; + } else if (value > cur_value) { + ret = gf_history_get_timestamp(fd, m_index + 1, len, &cur_value); + if (ret == -1) + goto out; + if (value < cur_value) + return m_index + 1; + else + return gf_history_b_search(fd, value, m_index + 1, to, len); + } else { + if (m_index == 0) { + /* we are sure that values exists + * in this htime file + */ + return 0; + } else { + ret = gf_history_get_timestamp(fd, m_index - 1, len, &cur_value); + if (ret == -1) + goto out; + if (value > cur_value) { + return m_index; + } else + return gf_history_b_search(fd, value, from, m_index - 1, len); + } + } +out: + return -1; +} + +/* + * Description: Checks if the changelog path is usable or not, + * which is differentiated by checking for "changelog" + * in the path and not "CHANGELOG". 
+ * + * Returns: + * 1 : Yes, usable ( contains "CHANGELOG" ) + * 0 : No, Not usable ( contains, "changelog") + */ +int +gf_is_changelog_usable(char *cl_path) +{ + int ret = -1; + const char low_c[] = "changelog"; + char *str_ret = NULL; + char *bname = NULL; + + bname = basename(cl_path); + + str_ret = strstr(bname, low_c); + + if (str_ret != NULL) + ret = 0; + else + ret = 1; + + return ret; +} + +void * +gf_changelog_consume_wrap(void *data) +{ + int ret = -1; + ssize_t nread = 0; + xlator_t *this = NULL; + gf_changelog_consume_data_t *ccd = NULL; + + ccd = (gf_changelog_consume_data_t *)data; + this = ccd->this; + + ccd->retval = -1; + + nread = sys_pread(ccd->fd, ccd->changelog, PATH_MAX - 1, ccd->offset); + if (nread < 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_READ_ERROR, + "cannot read from history metadata file"); + goto out; + } + + /* TODO: handle short reads and EOF. */ + if (gf_is_changelog_usable(ccd->changelog) == 1) { + ret = gf_changelog_consume(ccd->this, ccd->jnl, ccd->changelog, + _gf_true); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_PARSE_ERROR, + "name=%s", ccd->changelog, NULL); + goto out; + } + } + ccd->retval = 0; + +out: + return NULL; +} + +/** + * "gf_history_consume" is a worker function for history. + * parses and moves changelogs files from index "from" + * to index "to" in open htime file whose fd is "fd". 
+ */ + +#define MAX_PARALLELS 10 + +void * +gf_history_consume(void *data) +{ + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + int ret = 0; + int iter = 0; + int fd = -1; + int from = -1; + int to = -1; + int len = -1; + int n_parallel = 0; + int n_envoked = 0; + gf_boolean_t publish = _gf_true; + pthread_t th_id[MAX_PARALLELS] = { + 0, + }; + gf_changelog_history_data_t *hist_data = NULL; + gf_changelog_consume_data_t ccd[MAX_PARALLELS] = { + {0}, + }; + gf_changelog_consume_data_t *curr = NULL; + + hist_data = (gf_changelog_history_data_t *)data; + if (hist_data == NULL) { + ret = -1; + goto out; + } + + fd = hist_data->htime_fd; + from = hist_data->from; + to = hist_data->to; + len = hist_data->len; + n_parallel = hist_data->n_parallel; + + THIS = hist_data->this; + this = hist_data->this; + if (!this) { + ret = -1; + goto out; + } + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) { + ret = -1; + goto out; + } + + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) { + ret = -1; + goto out; + } + + while (from <= to) { + n_envoked = 0; + + for (iter = 0; (iter < n_parallel) && (from <= to); iter++) { + curr = &ccd[iter]; + + curr->this = this; + curr->jnl = hist_jnl; + curr->fd = fd; + curr->offset = from * (len + 1); + + curr->retval = 0; + memset(curr->changelog, '\0', PATH_MAX); + + ret = gf_thread_create(&th_id[iter], NULL, + gf_changelog_consume_wrap, curr, + "clogc%03hx", (iter + 1) & 0x3ff); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ret, + CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED, + "could not create consume-thread"); + goto sync; + } else + n_envoked++; + + from++; + } + + sync: + for (iter = 0; iter < n_envoked; iter++) { + ret = pthread_join(th_id[iter], NULL); + if (ret) { + publish = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, ret, + CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED, + "pthread_join() error"); + /* try to join the rest */ + continue; + } + + if (publish == 
_gf_false) + continue; + + curr = &ccd[iter]; + if (ccd->retval) { + publish = _gf_false; + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED, NULL); + continue; + } + + ret = gf_changelog_publish(curr->this, curr->jnl, curr->changelog); + if (ret) { + publish = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_PUBLISH_ERROR, + "publish error, ceased publishing..."); + } + } + } + + /* informing "parsing done". */ + hist_jnl->hist_done = (publish == _gf_true) ? 0 : -1; + +out: + if (fd != -1) + (void)sys_close(fd); + GF_FREE(hist_data); + return NULL; +} + +/** + * @API + * gf_history_changelog() : Get/parse historical changelogs and get them ready + * for consumption. + * + * Arguments: + * @changelog_dir : Directory location from where history changelogs are + * supposed to be consumed. + * @start: Unix timestamp FROM where changelogs should be consumed. + * @end: Unix timestamp TO where changelogsshould be consumed. + * @n_parallel : degree of parallelism while changelog parsing. + * @actual_end : the end time till where changelogs are available. + * + * Return: + * Returns <timestamp> on success, the last time till where changelogs are + * available. + * Returns -1 on failure(error). 
+ */ + +/** + * Extract timestamp range from a historical metadata file + * Returns: + * 0 : Success ({min,max}_ts with the appropriate values) + * -1 : Failure + * -2 : Ignore this metadata file and process next + */ +int +gf_changelog_extract_min_max(const char *dname, const char *htime_dir, int *fd, + unsigned long *total, unsigned long *min_ts, + unsigned long *max_ts) +{ + int ret = -1; + xlator_t *this = NULL; + char htime_file[PATH_MAX] = { + 0, + }; + struct stat stbuf = { + 0, + }; + char *iter = NULL; + char x_value[30] = { + 0, + }; + + this = THIS; + + snprintf(htime_file, PATH_MAX, "%s/%s", htime_dir, dname); + + iter = (htime_file + strlen(htime_file) - TIMESTAMP_LENGTH); + sscanf(iter, "%lu", min_ts); + + ret = sys_stat(htime_file, &stbuf); + if (ret) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR, + "op=stat", "path=%s", htime_file, NULL); + goto out; + } + + /* ignore everything except regular files */ + if (!S_ISREG(stbuf.st_mode)) { + ret = -2; + goto out; + } + + *fd = open(htime_file, O_RDONLY); + if (*fd < 0) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR, + "op=open", "path=%s", htime_file, NULL); + goto out; + } + + /* Looks good, extract max timestamp */ + ret = sys_fgetxattr(*fd, HTIME_KEY, x_value, sizeof(x_value)); + if (ret < 0) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_GET_XATTR_FAILED, "path=%s", htime_file, + NULL); + goto out; + } + + sscanf(x_value, "%lu:%lu", max_ts, total); + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_MIN_MAX_INFO, + "min=%lu", *min_ts, "max=%lu", *max_ts, "total_changelogs=%lu", + *total, NULL); + + ret = 0; + +out: + return ret; +} + +/* gf_history_changelog returns actual_end and spawns threads to + * parse historical changelogs. The return values are as follows. 
+ * 0 : On success + * 1 : Successful, but partial historical changelogs available, + * end time falls into different htime file or future time + * -2 : Error, requested historical changelog not available, not + * even partial + * -1 : On any error + */ +int +gf_history_changelog(char *changelog_dir, unsigned long start, + unsigned long end, int n_parallel, + unsigned long *actual_end) +{ + int ret = 0; + int len = -1; + int fd = -1; + int n_read = -1; + unsigned long min_ts = 0; + unsigned long max_ts = 0; + unsigned long end2 = 0; + unsigned long ts1 = 0; + unsigned long ts2 = 0; + unsigned long to = 0; + unsigned long from = 0; + unsigned long total_changelog = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + gf_changelog_history_data_t *hist_data = NULL; + DIR *dirp = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + pthread_t consume_th = 0; + char htime_dir[PATH_MAX] = { + 0, + }; + char buffer[PATH_MAX] = { + 0, + }; + gf_boolean_t partial_history = _gf_false; + + pthread_attr_t attr; + + this = THIS; + if (!this) { + ret = -1; + goto out; + } + + ret = pthread_attr_init(&attr); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_PTHREAD_ERROR, + "Pthread init failed"); + return -1; + } + + jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this); + if (!jnl) { + ret = -1; + goto out; + } + + hist_jnl = (gf_changelog_journal_t *)jnl->hist_jnl; + if (!hist_jnl) { + ret = -1; + goto out; + } + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_REQUESTING_INFO, + "start=%lu", start, "end=%lu", end, NULL); + + /* basic sanity check */ + if (start > end || n_parallel <= 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HIST_FAILED, + "start=%lu", start, "end=%lu", end, "thread_count=%d", + n_parallel, NULL); + ret = -1; + goto out; + } + + /* cap parallelism count */ + if (n_parallel > MAX_PARALLELS) + 
n_parallel = MAX_PARALLELS; + + CHANGELOG_FILL_HTIME_DIR(changelog_dir, htime_dir); + + dirp = sys_opendir(htime_dir); + if (dirp == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR, + "op=opendir", "path=%s", htime_dir, NULL); + ret = -1; + goto out; + } + + for (;;) { + errno = 0; + + entry = sys_readdir(dirp, scratch); + + if (!entry || errno != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start, + "end=%lu", end, NULL); + ret = -2; + break; + } + + ret = gf_changelog_extract_min_max(entry->d_name, htime_dir, &fd, + &total_changelog, &min_ts, &max_ts); + if (ret) { + if (-2 == ret) + continue; + goto out; + } + + if (start >= min_ts && start < max_ts) { + /** + * TODO: handle short reads later... + */ + n_read = sys_read(fd, buffer, PATH_MAX); + if (n_read < 0) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_READ_ERROR, + "unable to read htime file"); + goto out; + } + + len = strlen(buffer); + + /** + * search @start in the htime file returning it's index + * (@from) + */ + from = gf_history_b_search(fd, start, 0, total_changelog - 1, len); + + /* ensuring correctness of gf_b_search */ + if (gf_history_check(fd, from, start, len) != 0) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=start", + "start=%lu", start, "idx=%lu", from, NULL); + goto out; + } + + end2 = (end <= max_ts) ? end : max_ts; + + /* Check if end falls out of same HTIME file. The end + * falling to a different htime file or changelog + * disable-enable is detected only after 20 seconds. + * This is required because, applications generally + * asks historical changelogs till current time and + * it is possible changelog is not rolled over yet. + * So, buffer time of default rollover time plus 5 + * seconds is subtracted. 
If the application requests + * the end time with in half a minute of changelog + * disable, it's not detected as changelog disable and + * it's application's responsibility to retry after + * 20 seconds before confirming it as partial history. + */ + if ((end - 20) > max_ts) { + partial_history = _gf_true; + } + + /** + * search @end2 in htime file returning it's index (@to) + */ + to = gf_history_b_search(fd, end2, 0, total_changelog - 1, len); + + if (gf_history_check(fd, to, end2, len) != 0) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=end", + "start=%lu", end2, "idx=%lu", to, NULL); + goto out; + } + + ret = gf_history_get_timestamp(fd, from, len, &ts1); + if (ret == -1) + goto out; + + ret = gf_history_get_timestamp(fd, to, len, &ts2); + if (ret == -1) + goto out; + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_FINAL_INFO, + "from=%lu", ts1, "to=%lu", ts2, "changes=%lu", + (to - from + 1), NULL); + + hist_data = GF_CALLOC(1, sizeof(gf_changelog_history_data_t), + gf_changelog_mt_history_data_t); + + hist_data->htime_fd = fd; + hist_data->from = from; + hist_data->to = to; + hist_data->len = len; + hist_data->n_parallel = n_parallel; + hist_data->this = this; + + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ret, + CHANGELOG_LIB_MSG_PTHREAD_ERROR, + "unable to sets the detach" + " state attribute"); + ret = -1; + goto out; + } + + /* spawn a thread for background parsing & publishing */ + ret = gf_thread_create(&consume_th, &attr, gf_history_consume, + hist_data, "cloghcon"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ret, + CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED, + "creation of consume parent-thread" + " failed."); + ret = -1; + goto out; + } + + goto out; + + } else { /* end of range check */ + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start, + "end=%lu", end, "chlog_min=%lu", 
min_ts, "chlog_max=%lu", + max_ts, NULL); + } + } /* end of readdir() */ + +out: + if (dirp != NULL) + (void)sys_closedir(dirp); + + if (ret < 0) { + if (fd != -1) + (void)sys_close(fd); + GF_FREE(hist_data); + (void)pthread_attr_destroy(&attr); + + return ret; + } + + hist_jnl->hist_done = 1; + *actual_end = ts2; + + if (partial_history) { + ret = 1; + } + + return ret; +} diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am index e85031ad496..eee7dfa238d 100644 --- a/xlators/features/changelog/src/Makefile.am +++ b/xlators/features/changelog/src/Makefile.am @@ -3,16 +3,26 @@ xlator_LTLIBRARIES = changelog.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ - changelog-misc.h changelog-encoders.h changelog-notifier.h + changelog-rpc-common.h changelog-misc.h changelog-encoders.h \ + changelog-rpc-common.h changelog-rpc.h changelog-ev-handle.h \ + changelog-messages.h -changelog_la_LDFLAGS = -module -avoidversion +changelog_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ - changelog-encoders.c changelog-notifier.c -changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + changelog-encoders.c changelog-rpc.c changelog-barrier.c \ + changelog-rpc-common.c changelog-ev-handle.c +changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 \ - -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -nostartfiles -DDATADIR=\"$(localstatedir)\" +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/rpc-transport/socket/src \ + 
-I$(top_srcdir)/xlators/features/changelog/lib/src/ \ + -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ + -DDATADIR=\"$(localstatedir)\" AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/changelog/src/changelog-barrier.c b/xlators/features/changelog/src/changelog-barrier.c new file mode 100644 index 00000000000..0fb89ddb127 --- /dev/null +++ b/xlators/features/changelog/src/changelog-barrier.c @@ -0,0 +1,131 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-helpers.h" +#include "changelog-messages.h" +#include <glusterfs/call-stub.h> + +/* Enqueue a stub*/ +void +__chlog_barrier_enqueue(xlator_t *this, call_stub_t *stub) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + list_add_tail(&stub->list, &priv->queue); + priv->queue_size++; + + return; +} + +/* Dequeue a stub */ +call_stub_t * +__chlog_barrier_dequeue(xlator_t *this, struct list_head *queue) +{ + call_stub_t *stub = NULL; + changelog_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + if (list_empty(queue)) + goto out; + + stub = list_entry(queue->next, call_stub_t, list); + list_del_init(&stub->list); + +out: + return stub; +} + +/* Dequeue all the stubs and call corresponding resume functions */ +void +chlog_barrier_dequeue_all(xlator_t *this, struct list_head *queue) +{ + call_stub_t *stub = NULL; + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS, + NULL); + + while ((stub = __chlog_barrier_dequeue(this, queue))) + call_resume(stub); + + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED, NULL); + return; +} + +/* Function called on 
changelog barrier timeout */ +void +chlog_barrier_timeout(void *data) +{ + xlator_t *this = NULL; + changelog_priv_t *priv = NULL; + struct list_head queue = { + 0, + }; + + this = data; + THIS = this; + priv = this->private; + + INIT_LIST_HEAD(&queue); + + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_TIMEOUT, NULL); + + LOCK(&priv->lock); + { + __chlog_barrier_disable(this, &queue); + } + UNLOCK(&priv->lock); + + chlog_barrier_dequeue_all(this, &queue); + + return; +} + +/* Disable changelog barrier enable flag */ +void +__chlog_barrier_disable(xlator_t *this, struct list_head *queue) +{ + changelog_priv_t *priv = this->private; + GF_ASSERT(priv); + + if (priv->timer) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + + list_splice_init(&priv->queue, queue); + priv->queue_size = 0; + priv->barrier_enabled = _gf_false; +} + +/* Enable chagelog barrier enable with timer */ +int +__chlog_barrier_enable(xlator_t *this, changelog_priv_t *priv) +{ + int ret = -1; + + priv->timer = gf_timer_call_after(this->ctx, priv->timeout, + chlog_barrier_timeout, (void *)this); + if (!priv->timer) { + gf_smsg(this->name, GF_LOG_CRITICAL, 0, + CHANGELOG_MSG_TIMEOUT_ADD_FAILED, NULL); + goto out; + } + + priv->barrier_enabled = _gf_true; + ret = 0; +out: + return ret; +} diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c index 553eec85c30..63754516c2e 100644 --- a/xlators/features/changelog/src/changelog-encoders.c +++ b/xlators/features/changelog/src/changelog-encoders.c @@ -8,169 +8,225 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "changelog-encoders.h" size_t -entry_fn (void *data, char *buffer, gf_boolean_t encode) +entry_fn(void *data, char *buffer, gf_boolean_t encode) { - char *tmpbuf = NULL; - size_t bufsz = 0; - struct changelog_entry_fields *ce = NULL; - - ce = (struct changelog_entry_fields *) data; - - if (encode) { - tmpbuf = uuid_utoa (ce->cef_uuid); - CHANGELOG_FILL_BUFFER (buffer, bufsz, tmpbuf, strlen (tmpbuf)); - } else { - CHANGELOG_FILL_BUFFER (buffer, bufsz, - ce->cef_uuid, sizeof (uuid_t)); - } + char *tmpbuf = NULL; + size_t bufsz = 0; + struct changelog_entry_fields *ce = NULL; + + ce = (struct changelog_entry_fields *)data; + + if (encode) { + tmpbuf = uuid_utoa(ce->cef_uuid); + CHANGELOG_FILL_BUFFER(buffer, bufsz, tmpbuf, strlen(tmpbuf)); + } else { + CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_uuid, sizeof(uuid_t)); + } + + CHANGELOG_FILL_BUFFER(buffer, bufsz, "/", 1); + CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_bname, strlen(ce->cef_bname)); + return bufsz; +} - CHANGELOG_FILL_BUFFER (buffer, bufsz, "/", 1); - CHANGELOG_FILL_BUFFER (buffer, bufsz, - ce->cef_bname, strlen (ce->cef_bname)); - return bufsz; +size_t +del_entry_fn(void *data, char *buffer, gf_boolean_t encode) +{ + char *tmpbuf = NULL; + size_t bufsz = 0; + struct changelog_entry_fields *ce = NULL; + + ce = (struct changelog_entry_fields *)data; + + if (encode) { + tmpbuf = uuid_utoa(ce->cef_uuid); + CHANGELOG_FILL_BUFFER(buffer, bufsz, tmpbuf, strlen(tmpbuf)); + } else { + CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_uuid, sizeof(uuid_t)); + } + + CHANGELOG_FILL_BUFFER(buffer, bufsz, "/", 1); + CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_bname, strlen(ce->cef_bname)); + CHANGELOG_FILL_BUFFER(buffer, bufsz, "\0", 1); + + if (ce->cef_path[0] == '\0') { + CHANGELOG_FILL_BUFFER(buffer, bufsz, "\0", 1); + } else { + CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_path, + strlen(ce->cef_path)); + } + + return bufsz; } size_t 
-fop_fn (void *data, char *buffer, gf_boolean_t encode) +fop_fn(void *data, char *buffer, gf_boolean_t encode) { - char buf[10] = {0,}; - size_t bufsz = 0; - glusterfs_fop_t fop = 0; + char buf[10] = { + 0, + }; + size_t bufsz = 0; + glusterfs_fop_t fop = 0; - fop = *(glusterfs_fop_t *) data; + fop = *(glusterfs_fop_t *)data; - if (encode) { - (void) snprintf (buf, sizeof (buf), "%d", fop); - CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf)); - } else - CHANGELOG_FILL_BUFFER (buffer, bufsz, &fop, sizeof (fop)); + if (encode) { + (void)snprintf(buf, sizeof(buf), "%d", fop); + CHANGELOG_FILL_BUFFER(buffer, bufsz, buf, strlen(buf)); + } else + CHANGELOG_FILL_BUFFER(buffer, bufsz, &fop, sizeof(fop)); - return bufsz; + return bufsz; +} + +size_t +number_fn(void *data, char *buffer, gf_boolean_t encode) +{ + size_t bufsz = 0; + unsigned int nr = 0; + char buf[20] = { + 0, + }; + + nr = *(unsigned int *)data; + + if (encode) { + (void)snprintf(buf, sizeof(buf), "%u", nr); + CHANGELOG_FILL_BUFFER(buffer, bufsz, buf, strlen(buf)); + } else + CHANGELOG_FILL_BUFFER(buffer, bufsz, &nr, sizeof(unsigned int)); + + return bufsz; } void -entry_free_fn (void *data) +entry_free_fn(void *data) { - changelog_opt_t *co = data; + changelog_opt_t *co = data; - if (!co) - return; + if (!co) + return; - GF_FREE (co->co_entry.cef_bname); + GF_FREE(co->co_entry.cef_bname); +} + +void +del_entry_free_fn(void *data) +{ + changelog_opt_t *co = data; + + if (!co) + return; + + GF_FREE(co->co_entry.cef_bname); + GF_FREE(co->co_entry.cef_path); } /** * try to write all data in one shot */ -static inline void -changelog_encode_write_xtra (changelog_log_data_t *cld, - char *buffer, size_t *off, gf_boolean_t encode) +static void +changelog_encode_write_xtra(changelog_log_data_t *cld, char *buffer, + size_t *off, gf_boolean_t encode) { - int i = 0; - size_t offset = 0; - void *data = NULL; - changelog_opt_t *co = NULL; - - offset = *off; - - co = (changelog_opt_t *) cld->cld_ptr; - - for (; i < 
cld->cld_xtra_records; i++, co++) { - CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1); - - switch (co->co_type) { - case CHANGELOG_OPT_REC_FOP: - data = &co->co_fop; - break; - case CHANGELOG_OPT_REC_ENTRY: - data = &co->co_entry; - break; - } - - if (co->co_convert) - offset += co->co_convert (data, - buffer + offset, encode); - else /* no coversion: write it out as it is */ - CHANGELOG_FILL_BUFFER (buffer, offset, - data, co->co_len); + int i = 0; + size_t offset = 0; + void *data = NULL; + changelog_opt_t *co = NULL; + + offset = *off; + + co = (changelog_opt_t *)cld->cld_ptr; + + for (; i < cld->cld_xtra_records; i++, co++) { + CHANGELOG_FILL_BUFFER(buffer, offset, "\0", 1); + + switch (co->co_type) { + case CHANGELOG_OPT_REC_FOP: + data = &co->co_fop; + break; + case CHANGELOG_OPT_REC_ENTRY: + data = &co->co_entry; + break; + case CHANGELOG_OPT_REC_UINT32: + data = &co->co_uint32; + break; } - *off = offset; + if (co->co_convert) + offset += co->co_convert(data, buffer + offset, encode); + else /* no coversion: write it out as it is */ + CHANGELOG_FILL_BUFFER(buffer, offset, data, co->co_len); + } + + *off = offset; } int -changelog_encode_ascii (xlator_t *this, changelog_log_data_t *cld) +changelog_encode_ascii(xlator_t *this, changelog_log_data_t *cld) { - size_t off = 0; - size_t gfid_len = 0; - char *gfid_str = NULL; - char *buffer = NULL; - changelog_priv_t *priv = NULL; + size_t off = 0; + size_t gfid_len = 0; + char *gfid_str = NULL; + char *buffer = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; + priv = this->private; - gfid_str = uuid_utoa (cld->cld_gfid); - gfid_len = strlen (gfid_str); + gfid_str = uuid_utoa(cld->cld_gfid); + gfid_len = strlen(gfid_str); - /* extra bytes for decorations */ - buffer = alloca (gfid_len + cld->cld_ptr_len + 10); - CHANGELOG_STORE_ASCII (priv, buffer, - off, gfid_str, gfid_len, cld); + /* extra bytes for decorations */ + buffer = alloca(gfid_len + cld->cld_ptr_len + 10); + CHANGELOG_STORE_ASCII(priv, 
buffer, off, gfid_str, gfid_len, cld); - if (cld->cld_xtra_records) - changelog_encode_write_xtra (cld, buffer, &off, _gf_true); + if (cld->cld_xtra_records) + changelog_encode_write_xtra(cld, buffer, &off, _gf_true); - CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1); - return changelog_write_change (priv, buffer, off); + return changelog_write_change(priv, buffer, off); } int -changelog_encode_binary (xlator_t *this, changelog_log_data_t *cld) +changelog_encode_binary(xlator_t *this, changelog_log_data_t *cld) { - size_t off = 0; - char *buffer = NULL; - changelog_priv_t *priv = NULL; + size_t off = 0; + char *buffer = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; + priv = this->private; - /* extra bytes for decorations */ - buffer = alloca (sizeof (uuid_t) + cld->cld_ptr_len + 10); - CHANGELOG_STORE_BINARY (priv, buffer, off, cld->cld_gfid, cld); + /* extra bytes for decorations */ + buffer = alloca(sizeof(uuid_t) + cld->cld_ptr_len + 10); + CHANGELOG_STORE_BINARY(priv, buffer, off, cld->cld_gfid, cld); - if (cld->cld_xtra_records) - changelog_encode_write_xtra (cld, buffer, &off, _gf_false); + if (cld->cld_xtra_records) + changelog_encode_write_xtra(cld, buffer, &off, _gf_false); - CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1); - return changelog_write_change (priv, buffer, off); + return changelog_write_change(priv, buffer, off); } -static struct changelog_encoder -cb_encoder[] = { - [CHANGELOG_ENCODE_BINARY] = +static struct changelog_encoder cb_encoder[] = { + [CHANGELOG_ENCODE_BINARY] = { - .encoder = CHANGELOG_ENCODE_BINARY, - .encode = changelog_encode_binary, + .encoder = CHANGELOG_ENCODE_BINARY, + .encode = changelog_encode_binary, }, - [CHANGELOG_ENCODE_ASCII] = + [CHANGELOG_ENCODE_ASCII] = { - .encoder = CHANGELOG_ENCODE_ASCII, - .encode = changelog_encode_ascii, + .encoder = CHANGELOG_ENCODE_ASCII, + .encode = changelog_encode_ascii, }, }; void 
-changelog_encode_change( changelog_priv_t * priv) +changelog_encode_change(changelog_priv_t *priv) { - priv->ce = &cb_encoder[priv->encode_mode]; + priv->ce = &cb_encoder[priv->encode_mode]; } diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h index a3efbee05ee..26252696d56 100644 --- a/xlators/features/changelog/src/changelog-encoders.h +++ b/xlators/features/changelog/src/changelog-encoders.h @@ -11,35 +11,39 @@ #ifndef _CHANGELOG_ENCODERS_H #define _CHANGELOG_ENCODERS_H -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "changelog-helpers.h" -#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) do { \ - CHANGELOG_FILL_BUFFER (buffer, off, \ - priv->maps[cld->cld_type], 1); \ - CHANGELOG_FILL_BUFFER (buffer, \ - off, gfid, gfid_len); \ - } while (0) +#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) \ + do { \ + CHANGELOG_FILL_BUFFER(buffer, off, priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER(buffer, off, gfid, gfid_len); \ + } while (0) -#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do { \ - CHANGELOG_FILL_BUFFER (buffer, off, \ - priv->maps[cld->cld_type], 1); \ - CHANGELOG_FILL_BUFFER (buffer, \ - off, gfid, sizeof (uuid_t)); \ - } while (0) +#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) \ + do { \ + CHANGELOG_FILL_BUFFER(buffer, off, priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER(buffer, off, gfid, sizeof(uuid_t)); \ + } while (0) size_t -entry_fn (void *data, char *buffer, gf_boolean_t encode); +entry_fn(void *data, char *buffer, gf_boolean_t encode); size_t -fop_fn (void *data, char *buffer, gf_boolean_t encode); +del_entry_fn(void *data, char *buffer, gf_boolean_t encode); +size_t +fop_fn(void *data, char *buffer, gf_boolean_t encode); +size_t +number_fn(void *data, char *buffer, gf_boolean_t encode); +void +entry_free_fn(void *data); void 
-entry_free_fn (void *data); +del_entry_free_fn(void *data); int -changelog_encode_binary (xlator_t *, changelog_log_data_t *); +changelog_encode_binary(xlator_t *, changelog_log_data_t *); int -changelog_encode_ascii (xlator_t *, changelog_log_data_t *); +changelog_encode_ascii(xlator_t *, changelog_log_data_t *); void changelog_encode_change(changelog_priv_t *); diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c new file mode 100644 index 00000000000..aa94459de5a --- /dev/null +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -0,0 +1,412 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-ev-handle.h" +#include "changelog-rpc-common.h" +#include "changelog-helpers.h" + +struct rpc_clnt_program changelog_ev_program; + +#define NR_IOVEC (MAX_IOVEC - 3) +struct ev_rpc_vec { + int count; + struct iovec vector[NR_IOVEC]; + + /* sequence number */ + unsigned long seq; +}; + +struct ev_rpc { + rbuf_list_t *rlist; + struct rpc_clnt *rpc; + struct ev_rpc_vec vec; +}; + +/** + * As of now this just does the minimal (retval logging). Going further + * un-acknowledges sequence numbers can be retransmitted and other + * intelligence can be built into the server. 
+ */ +int +changelog_event_dispatch_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + return 0; +} + +/* dispatcher RPC */ +int +changelog_dispatch_vec(call_frame_t *frame, xlator_t *this, + struct rpc_clnt *rpc, struct ev_rpc_vec *vec) +{ + struct timeval tv = { + 0, + }; + changelog_event_req req = { + 0, + }; + + (void)gettimeofday(&tv, NULL); + + /** + * Event dispatch RPC header contains a sequence number for each + * dispatch. This allows the receiver to order the request before + * processing. + */ + req.seq = vec->seq; + req.tv_sec = tv.tv_sec; + req.tv_usec = tv.tv_usec; + + return changelog_rpc_sumbit_req( + rpc, (void *)&req, frame, &changelog_ev_program, + CHANGELOG_REV_PROC_EVENT, vec->vector, vec->count, NULL, this, + changelog_event_dispatch_cbk, (xdrproc_t)xdr_changelog_event_req); +} + +int +changelog_event_dispatch_rpc(call_frame_t *frame, xlator_t *this, void *data) +{ + int idx = 0; + int count = 0; + int ret = 0; + unsigned long sequence = 0; + rbuf_iovec_t *rvec = NULL; + struct ev_rpc *erpc = NULL; + struct rlist_iter riter = { + { + 0, + }, + }; + + /* dispatch NR_IOVEC IO vectors at a time. 
*/ + + erpc = data; + sequence = erpc->rlist->seq[0]; + + rlist_iter_init(&riter, erpc->rlist); + + rvec_for_each_entry(rvec, &riter) + { + idx = count % NR_IOVEC; + if (++count == NR_IOVEC) { + erpc->vec.vector[idx] = rvec->iov; + erpc->vec.seq = sequence++; + erpc->vec.count = NR_IOVEC; + + ret = changelog_dispatch_vec(frame, this, erpc->rpc, &erpc->vec); + if (ret) + break; + count = 0; + continue; + } + + erpc->vec.vector[idx] = rvec->iov; + } + + if (ret) + goto error_return; + + idx = count % NR_IOVEC; + if (idx) { + erpc->vec.seq = sequence; + erpc->vec.count = idx; + + ret = changelog_dispatch_vec(frame, this, erpc->rpc, &erpc->vec); + } + +error_return: + return ret; +} + +int +changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, + void *data) +{ + xlator_t *this = NULL; + changelog_rpc_clnt_t *crpc = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_priv_t *priv = NULL; + changelog_ev_selector_t *selection = NULL; + uint64_t clntcnt = 0; + uint64_t xprtcnt = 0; + + crpc = mydata; + this = crpc->this; + c_clnt = crpc->c_clnt; + + priv = this->private; + + switch (event) { + case RPC_CLNT_CONNECT: + selection = &priv->ev_selection; + GF_ATOMIC_INC(priv->clntcnt); + + LOCK(&c_clnt->wait_lock); + { + LOCK(&c_clnt->active_lock); + { + changelog_select_event(this, selection, crpc->filter); + list_move_tail(&crpc->list, &c_clnt->active); + } + UNLOCK(&c_clnt->active_lock); + } + UNLOCK(&c_clnt->wait_lock); + + break; + case RPC_CLNT_DISCONNECT: + rpc_clnt_disable(crpc->rpc); + + /* rpc_clnt_disable doesn't unref the rpc. It just marks + * the rpc as disabled and cancels reconnection timer. + * Hence unref the rpc object to free it. 
+ */ + rpc_clnt_unref(crpc->rpc); + + if (priv) + selection = &priv->ev_selection; + + LOCK(&crpc->lock); + { + if (selection) + changelog_deselect_event(this, selection, crpc->filter); + changelog_set_disconnect_flag(crpc, _gf_true); + } + UNLOCK(&crpc->lock); + LOCK(&c_clnt->active_lock); + { + list_del_init(&crpc->list); + } + UNLOCK(&c_clnt->active_lock); + + break; + case RPC_CLNT_MSG: + case RPC_CLNT_DESTROY: + /* Free up mydata */ + changelog_rpc_clnt_unref(crpc); + clntcnt = GF_ATOMIC_DEC(priv->clntcnt); + xprtcnt = GF_ATOMIC_GET(priv->xprtcnt); + if (this->cleanup_starting) { + if (!clntcnt && !xprtcnt) + changelog_process_cleanup_event(this); + } + break; + case RPC_CLNT_PING: + break; + } + + return 0; +} + +void * +changelog_ev_connector(void *data) +{ + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + + c_clnt = data; + this = c_clnt->this; + + while (1) { + pthread_mutex_lock(&c_clnt->pending_lock); + { + while (list_empty(&c_clnt->pending)) + pthread_cond_wait(&c_clnt->pending_cond, &c_clnt->pending_lock); + crpc = list_first_entry(&c_clnt->pending, changelog_rpc_clnt_t, + list); + crpc->rpc = changelog_rpc_client_init(this, crpc, crpc->sock, + changelog_rpc_notify); + if (!crpc->rpc) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_RPC_CONNECT_ERROR, "path=%s", crpc->sock, + NULL); + crpc->cleanup(crpc); + goto mutex_unlock; + } + + LOCK(&c_clnt->wait_lock); + { + list_move_tail(&crpc->list, &c_clnt->waitq); + } + UNLOCK(&c_clnt->wait_lock); + } + mutex_unlock: + pthread_mutex_unlock(&c_clnt->pending_lock); + } + + return NULL; +} + +void +changelog_ev_cleanup_connections(xlator_t *this, changelog_clnt_t *c_clnt) +{ + changelog_rpc_clnt_t *crpc = NULL; + + /* cleanup active connections */ + LOCK(&c_clnt->active_lock); + { + list_for_each_entry(crpc, &c_clnt->active, list) + { + rpc_clnt_disable(crpc->rpc); + } + } + UNLOCK(&c_clnt->active_lock); +} + +/** + * TODO: granularize lock + * + * If 
we have multiple threads dispatching events, doing it this way is + * a performance bottleneck. + */ + +static changelog_rpc_clnt_t * +get_client(changelog_clnt_t *c_clnt, struct list_head **next) +{ + changelog_rpc_clnt_t *crpc = NULL; + + LOCK(&c_clnt->active_lock); + { + if (*next == &c_clnt->active) + goto unblock; + crpc = list_entry(*next, changelog_rpc_clnt_t, list); + /* ref rpc as DISCONNECT might unref the rpc asynchronously */ + changelog_rpc_clnt_ref(crpc); + rpc_clnt_ref(crpc->rpc); + *next = (*next)->next; + } +unblock: + UNLOCK(&c_clnt->active_lock); + + return crpc; +} + +static void +put_client(changelog_clnt_t *c_clnt, changelog_rpc_clnt_t *crpc) +{ + LOCK(&c_clnt->active_lock); + { + rpc_clnt_unref(crpc->rpc); + changelog_rpc_clnt_unref(crpc); + } + UNLOCK(&c_clnt->active_lock); +} + +void +_dispatcher(rbuf_list_t *rlist, void *arg) +{ + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + struct ev_rpc erpc = { + 0, + }; + struct list_head *next = NULL; + + c_clnt = arg; + this = c_clnt->this; + + erpc.rlist = rlist; + next = c_clnt->active.next; + + while (1) { + crpc = get_client(c_clnt, &next); + if (!crpc) + break; + erpc.rpc = crpc->rpc; + (void)changelog_invoke_rpc(this, crpc->rpc, &changelog_ev_program, + CHANGELOG_REV_PROC_EVENT, &erpc); + put_client(c_clnt, crpc); + } +} + +/** this is called under rotbuff's lock */ +void +sequencer(rbuf_list_t *rlist, void *mydata) +{ + unsigned long range = 0; + changelog_clnt_t *c_clnt = 0; + + c_clnt = mydata; + + range = (RLIST_ENTRY_COUNT(rlist)) / NR_IOVEC; + if ((RLIST_ENTRY_COUNT(rlist)) % NR_IOVEC) + range++; + RLIST_STORE_SEQ(rlist, c_clnt->sequence, range); + + c_clnt->sequence += range; +} + +void * +changelog_ev_dispatch(void *data) +{ + int ret = 0; + void *opaque = NULL; + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + struct timeval tv = { + 0, + }; + + c_clnt = data; + this = c_clnt->this; + + while (1) { + /* TODO: change this 
to be pthread cond based.. later */ + + tv.tv_sec = 1; + tv.tv_usec = 0; + select(0, NULL, NULL, NULL, &tv); + + ret = rbuf_get_buffer(c_clnt->rbuf, &opaque, sequencer, c_clnt); + if (ret != RBUF_CONSUMABLE) { + if (ret != RBUF_EMPTY) + gf_smsg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_MSG_BUFFER_STARVATION_ERROR, + "Failed to get buffer for RPC dispatch", + "rbuf_retval=%d", ret, NULL); + continue; + } + + ret = rbuf_wait_for_completion(c_clnt->rbuf, opaque, _dispatcher, + c_clnt); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_MSG_PUT_BUFFER_FAILED, NULL); + } + + return NULL; +} + +void +changelog_ev_queue_connection(changelog_clnt_t *c_clnt, + changelog_rpc_clnt_t *crpc) +{ + pthread_mutex_lock(&c_clnt->pending_lock); + { + list_add_tail(&crpc->list, &c_clnt->pending); + pthread_cond_signal(&c_clnt->pending_cond); + } + pthread_mutex_unlock(&c_clnt->pending_lock); +} + +struct rpc_clnt_procedure changelog_ev_procs[CHANGELOG_REV_PROC_MAX] = { + [CHANGELOG_REV_PROC_NULL] = {"NULL", NULL}, + [CHANGELOG_REV_PROC_EVENT] = {"EVENT DISPATCH", + changelog_event_dispatch_rpc}, +}; + +struct rpc_clnt_program changelog_ev_program = { + .progname = "CHANGELOG EVENT DISPATCHER", + .prognum = CHANGELOG_REV_RPC_PROCNUM, + .progver = CHANGELOG_REV_RPC_PROCVER, + .numproc = CHANGELOG_REV_PROC_MAX, + .proctable = changelog_ev_procs, +}; diff --git a/xlators/features/changelog/src/changelog-ev-handle.h b/xlators/features/changelog/src/changelog-ev-handle.h new file mode 100644 index 00000000000..cc1af58a276 --- /dev/null +++ b/xlators/features/changelog/src/changelog-ev-handle.h @@ -0,0 +1,136 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __CHANGELOG_EV_HANDLE_H +#define __CHANGELOG_EV_HANDLE_H + +#include <glusterfs/list.h> +#include <glusterfs/xlator.h> +#include "rpc-clnt.h" + +#include <glusterfs/rot-buffs.h> + +struct changelog_clnt; + +typedef struct changelog_rpc_clnt { + xlator_t *this; + + gf_lock_t lock; + + gf_atomic_t ref; + gf_boolean_t disconnected; + + unsigned int filter; + char sock[UNIX_PATH_MAX]; + + struct changelog_clnt *c_clnt; /* back pointer to list holder */ + + struct rpc_clnt *rpc; /* RPC client endpoint */ + + struct list_head list; /* ->pending, ->waitq, ->active */ + + void (*cleanup)(struct changelog_rpc_clnt *); /* cleanup handler */ +} changelog_rpc_clnt_t; + +static inline void +changelog_rpc_clnt_ref(changelog_rpc_clnt_t *crpc) +{ + GF_ATOMIC_INC(crpc->ref); +} + +static inline void +changelog_set_disconnect_flag(changelog_rpc_clnt_t *crpc, gf_boolean_t flag) +{ + crpc->disconnected = flag; +} + +static inline int +changelog_rpc_clnt_is_disconnected(changelog_rpc_clnt_t *crpc) +{ + return (crpc->disconnected == _gf_true); +} + +static inline void +changelog_rpc_clnt_unref(changelog_rpc_clnt_t *crpc) +{ + gf_boolean_t gone = _gf_false; + uint64_t ref = 0; + + ref = GF_ATOMIC_DEC(crpc->ref); + + if (!ref && changelog_rpc_clnt_is_disconnected(crpc)) { + list_del(&crpc->list); + gone = _gf_true; + } + + if (gone) + crpc->cleanup(crpc); +} + +/** + * This structure holds pending and active clients. On probe RPC all + * an instance of the above structure (@changelog_rpc_clnt) is placed + * in ->pending and gets moved to ->active on a successful connect. + * + * locking rules: + * + * Manipulating ->pending + * ->pending_lock + * ->pending + * + * Manipulating ->active + * ->active_lock + * ->active + * + * Moving object from ->pending to ->active + * ->pending_lock + * ->active_lock + * + * Objects are _never_ moved from ->active to ->pending, i.e., during + * disconnection, the object is destroyed. 
Well, we could have tried + * to reconnect, but that's pure waste.. let the other end reconnect. + */ + +typedef struct changelog_clnt { + xlator_t *this; + + /* pending connections */ + pthread_mutex_t pending_lock; + pthread_cond_t pending_cond; + struct list_head pending; + + /* current active connections */ + gf_lock_t active_lock; + struct list_head active; + + gf_lock_t wait_lock; + struct list_head waitq; + + /* consumer part of rot-buffs */ + rbuf_t *rbuf; + unsigned long sequence; +} changelog_clnt_t; + +void * +changelog_ev_connector(void *); + +void * +changelog_ev_dispatch(void *); + +/* APIs */ +void +changelog_ev_queue_connection(changelog_clnt_t *, changelog_rpc_clnt_t *); + +void +changelog_ev_cleanup_connections(xlator_t *, changelog_clnt_t *); + +void +changelog_process_cleanup_event(xlator_t *); +#endif diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c index 7ab0091b592..e561997d858 100644 --- a/xlators/features/changelog/src/changelog-helpers.c +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -8,371 +8,1233 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" -#include "logging.h" -#include "iobuf.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/logging.h> +#include <glusterfs/iobuf.h> +#include <glusterfs/syscall.h> #include "changelog-helpers.h" +#include "changelog-encoders.h" #include "changelog-mem-types.h" +#include "changelog-messages.h" #include "changelog-encoders.h" +#include "changelog-rpc-common.h" #include <pthread.h> +#include <time.h> + +static void +changelog_cleanup_free_mutex(void *arg_mutex) +{ + pthread_mutex_t *p_mutex = (pthread_mutex_t *)arg_mutex; + + if (p_mutex) + pthread_mutex_unlock(p_mutex); +} + +int +changelog_thread_cleanup(xlator_t *this, pthread_t thr_id) +{ + int ret = 0; + void *retval = NULL; + + /* send a cancel request to the thread */ + ret = pthread_cancel(thr_id); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL); + goto out; + } + + ret = pthread_join(thr_id, &retval); + if ((ret != 0) || (retval != PTHREAD_CANCELED)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL); + } + +out: + return ret; +} + +void * +changelog_get_usable_buffer(changelog_local_t *local) +{ + changelog_log_data_t *cld = NULL; + + if (!local) + return NULL; + + cld = &local->cld; + if (!cld->cld_iobuf) + return NULL; + + return cld->cld_iobuf->ptr; +} + +static int +changelog_selector_index(unsigned int selector) +{ + return (ffs(selector) - 1); +} + +int +changelog_ev_selected(xlator_t *this, changelog_ev_selector_t *selection, + unsigned int selector) +{ + int idx = 0; + + idx = changelog_selector_index(selector); + gf_msg_debug(this->name, 0, "selector ref count for %d (idx: %d): %d", + selector, idx, selection->ref[idx]); + /* this can be lockless */ + return (idx < CHANGELOG_EV_SELECTION_RANGE && (selection->ref[idx] > 0)); +} void 
-changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) +changelog_select_event(xlator_t *this, changelog_ev_selector_t *selection, + unsigned int selector) { - int ret = 0; - void *retval = NULL; + int idx = 0; + + LOCK(&selection->reflock); + { + while (selector) { + idx = changelog_selector_index(selector); + if (idx < CHANGELOG_EV_SELECTION_RANGE) { + selection->ref[idx]++; + gf_msg_debug(this->name, 0, "selecting event %d", idx); + } + selector &= ~(1 << idx); + } + } + UNLOCK(&selection->reflock); +} - /* send a cancel request to the thread */ - ret = pthread_cancel (thr_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "could not cancel thread (reason: %s)", - strerror (errno)); - goto out; +void +changelog_deselect_event(xlator_t *this, changelog_ev_selector_t *selection, + unsigned int selector) +{ + int idx = 0; + + LOCK(&selection->reflock); + { + while (selector) { + idx = changelog_selector_index(selector); + if (idx < CHANGELOG_EV_SELECTION_RANGE) { + selection->ref[idx]--; + gf_msg_debug(this->name, 0, "de-selecting event %d", idx); + } + selector &= ~(1 << idx); } + } + UNLOCK(&selection->reflock); +} + +int +changelog_init_event_selection(xlator_t *this, + changelog_ev_selector_t *selection) +{ + int ret = 0; + int j = CHANGELOG_EV_SELECTION_RANGE; + + ret = LOCK_INIT(&selection->reflock); + if (ret != 0) + return -1; - ret = pthread_join (thr_id, &retval); - if (ret || (retval != PTHREAD_CANCELED)) { - gf_log (this->name, GF_LOG_ERROR, - "cancel request not adhered as expected" - " (reason: %s)", strerror (errno)); + LOCK(&selection->reflock); + { + while (j--) { + selection->ref[j] = 0; } + } + UNLOCK(&selection->reflock); - out: - return; + return 0; } -inline void * -changelog_get_usable_buffer (changelog_local_t *local) +static void +changelog_perform_dispatch(xlator_t *this, changelog_priv_t *priv, void *mem, + size_t size) { - changelog_log_data_t *cld = NULL; + char *buf = NULL; + void *opaque = NULL; + + buf = 
rbuf_reserve_write_area(priv->rbuf, size, &opaque); + if (!buf) { + gf_msg_callingfn(this->name, GF_LOG_WARNING, 0, + CHANGELOG_MSG_DISPATCH_EVENT_FAILED, + "failed to dispatch event"); + return; + } - cld = &local->cld; - if (!cld->cld_iobuf) - return NULL; + memcpy(buf, mem, size); + rbuf_write_complete(opaque); +} - return cld->cld_iobuf->ptr; +void +changelog_dispatch_event(xlator_t *this, changelog_priv_t *priv, + changelog_event_t *ev) +{ + changelog_ev_selector_t *selection = NULL; + + selection = &priv->ev_selection; + if (changelog_ev_selected(this, selection, ev->ev_type)) { + changelog_perform_dispatch(this, priv, ev, CHANGELOG_EV_SIZE); + } } -inline void -changelog_set_usable_record_and_length (changelog_local_t *local, - size_t len, int xr) +void +changelog_set_usable_record_and_length(changelog_local_t *local, size_t len, + int xr) { - changelog_log_data_t *cld = NULL; + changelog_log_data_t *cld = NULL; - cld = &local->cld; + cld = &local->cld; - cld->cld_ptr_len = len; - cld->cld_xtra_records = xr; + cld->cld_ptr_len = len; + cld->cld_xtra_records = xr; } void -changelog_local_cleanup (xlator_t *xl, changelog_local_t *local) +changelog_local_cleanup(xlator_t *xl, changelog_local_t *local) { - int i = 0; - changelog_opt_t *co = NULL; - changelog_log_data_t *cld = NULL; + int i = 0; + changelog_opt_t *co = NULL; + changelog_log_data_t *cld = NULL; - if (!local) - return; + if (!local) + return; - cld = &local->cld; + cld = &local->cld; - /* cleanup dynamic allocation for extra records */ - if (cld->cld_xtra_records) { - co = (changelog_opt_t *) cld->cld_ptr; - for (; i < cld->cld_xtra_records; i++, co++) - if (co->co_free) - co->co_free (co); - } + /* cleanup dynamic allocation for extra records */ + if (cld->cld_xtra_records) { + co = (changelog_opt_t *)cld->cld_ptr; + for (; i < cld->cld_xtra_records; i++, co++) + if (co->co_free) + co->co_free(co); + } - CHANGELOG_IOBUF_UNREF (cld->cld_iobuf); + CHANGELOG_IOBUF_UNREF(cld->cld_iobuf); - if 
(local->inode) - inode_unref (local->inode); + if (local->inode) + inode_unref(local->inode); - mem_put (local); + mem_put(local); } -inline int -changelog_write (int fd, char *buffer, size_t len) +int +changelog_write(int fd, char *buffer, size_t len) { - ssize_t size = 0; - size_t writen = 0; + ssize_t size = 0; + size_t written = 0; - while (writen < len) { - size = write (fd, - buffer + writen, len - writen); - if (size <= 0) - break; + while (written < len) { + size = sys_write(fd, buffer + written, len - written); + if (size <= 0) + break; - writen += size; + written += size; + } + + return (written != len); +} + +int +htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer) +{ + char changelog_path[PATH_MAX + 1] = { + 0, + }; + int len = -1; + char x_value[25] = { + 0, + }; + /* time stamp(10) + : (1) + rolltime (12 ) + buffer (2) */ + int ret = 0; + + if (priv->htime_fd == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR, + "reason=fd not available", NULL); + ret = -1; + goto out; + } + len = snprintf(changelog_path, PATH_MAX, "%s", buffer); + if (len >= PATH_MAX) { + ret = -1; + goto out; + } + if (changelog_write(priv->htime_fd, (void *)changelog_path, len + 1) < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR, + "reason=write failed", NULL); + ret = -1; + goto out; + } + + len = snprintf(x_value, sizeof(x_value), "%ld:%d", ts, + priv->rollover_count); + if (len >= sizeof(x_value)) { + ret = -1; + goto out; + } + + if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, XATTR_REPLACE)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR, + "reason=xattr updation failed", "XATTR_REPLACE=true", + "changelog=%s", changelog_path, NULL); + + if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, 0)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR, + "reason=xattr updation failed", "changelog=%s", + changelog_path, NULL); + ret = -1; + goto out; } + 
} + + priv->rollover_count += 1; + +out: + return ret; +} + +/* + * Description: Check if the changelog to rollover is empty or not. + * It is assumed that fd passed is already verified. + * + * Returns: + * 1 : If found empty, changed path from "CHANGELOG.<TS>" to "changelog.<TS>" + * 0 : If NOT empty, proceed usual. + */ +int +cl_is_empty(xlator_t *this, int fd) +{ + int ret = -1; + size_t elen = 0; + int encoding = -1; + char buffer[1024] = { + 0, + }; + struct stat stbuf = { + 0, + }; + int major_version = -1; + int minor_version = -1; + + ret = sys_fstat(fd, &stbuf); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSTAT_OP_FAILED, + NULL); + goto out; + } + + ret = sys_lseek(fd, 0, SEEK_SET); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_LSEEK_OP_FAILED, + NULL); + goto out; + } + + CHANGELOG_GET_HEADER_INFO(fd, buffer, sizeof(buffer), encoding, + major_version, minor_version, elen); + + if (elen == stbuf.st_size) { + ret = 1; + } else { + ret = 0; + } + +out: + return ret; +} - return (writen != len); +/* + * Description: Updates "CHANGELOG" to "changelog" for writing changelog path + * to htime file. 
+ * + * Returns: + * 0 : Success + * -1 : Error + */ +int +update_path(xlator_t *this, char *cl_path) +{ + const char low_cl[] = "changelog"; + const char up_cl[] = "CHANGELOG"; + char *found = NULL; + int ret = -1; + + found = strstr(cl_path, up_cl); + + if (found == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PATH_NOT_FOUND, + NULL); + goto out; + } else { + memcpy(found, low_cl, sizeof(low_cl) - 1); + } + + ret = 0; +out: + return ret; } static int -changelog_rollover_changelog (xlator_t *this, - changelog_priv_t *priv, unsigned long ts) -{ - int ret = -1; - int notify = 0; - char *bname = NULL; - char ofile[PATH_MAX] = {0,}; - char nfile[PATH_MAX] = {0,}; - - if (priv->changelog_fd != -1) { - close (priv->changelog_fd); - priv->changelog_fd = -1; +changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, time_t ts) +{ + int ret = -1; + int notify = 0; + int cl_empty_flag = 0; + struct tm *gmt; + char yyyymmdd[40]; + char ofile[PATH_MAX] = { + 0, + }; + char nfile[PATH_MAX] = { + 0, + }; + char nfile_dir[PATH_MAX] = { + 0, + }; + changelog_event_t ev = { + 0, + }; + + if (priv->changelog_fd != -1) { + ret = sys_fsync(priv->changelog_fd); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_FSYNC_OP_FAILED, NULL); } + ret = cl_is_empty(this, priv->changelog_fd); + if (ret == 1) { + cl_empty_flag = 1; + } else if (ret == -1) { + /* Log error but proceed as usual */ + gf_smsg(this->name, GF_LOG_WARNING, 0, + CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED, NULL); + } + sys_close(priv->changelog_fd); + priv->changelog_fd = -1; + } - (void) snprintf (ofile, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME, priv->changelog_dir); - (void) snprintf (nfile, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME".%lu", - priv->changelog_dir, ts); + /* Get GMT time. 
*/ + gmt = gmtime(&ts); - ret = rename (ofile, nfile); - if (!ret) - notify = 1; + strftime(yyyymmdd, sizeof(yyyymmdd), "%Y/%m/%d", gmt); - if (ret && (errno == ENOENT)) { - ret = 0; + (void)snprintf(ofile, PATH_MAX, "%s/" CHANGELOG_FILE_NAME, + priv->changelog_dir); + (void)snprintf(nfile, PATH_MAX, "%s/%s/" CHANGELOG_FILE_NAME ".%ld", + priv->changelog_dir, yyyymmdd, ts); + (void)snprintf(nfile_dir, PATH_MAX, "%s/%s", priv->changelog_dir, yyyymmdd); + + if (cl_empty_flag == 1) { + ret = sys_unlink(ofile); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_UNLINK_OP_FAILED, "path=%s", ofile, NULL); + ret = 0; /* Error in unlinking empty changelog should + not break further changelog operation, so + reset return value to 0*/ } + } else { + ret = sys_rename(ofile, nfile); + /* Changelog file rename gets ENOENT when parent dir doesn't exist */ + if (errno == ENOENT) { + ret = mkdir_p(nfile_dir, 0600, _gf_true); + + if ((ret == -1) && (EEXIST != errno)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_MKDIR_ERROR, "%s", nfile_dir, NULL); + goto out; + } + + ret = sys_rename(ofile, nfile); + } + + if (ret && (errno == ENOENT)) { + ret = 0; + goto out; + } if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "error renaming %s -> %s (reason %s)", - ofile, nfile, strerror (errno)); + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_RENAME_ERROR, + "from=%s", ofile, "to=%s", nfile, NULL); } + } - if (notify) { - bname = basename (nfile); - gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname); - ret = changelog_write (priv->wfd, bname, strlen (bname) + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to send file name to notify thread" - " (reason: %s)", strerror (errno)); - } + if (!ret && (cl_empty_flag == 0)) { + notify = 1; + } + + if (!ret) { + if (cl_empty_flag) { + update_path(this, nfile); + } + ret = htime_update(this, priv, ts, nfile); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, 
CHANGELOG_MSG_HTIME_ERROR, + NULL); + goto out; } + } + + if (notify) { + ev.ev_type = CHANGELOG_OP_TYPE_JOURNAL; + memcpy(ev.u.journal.path, nfile, strlen(nfile) + 1); + changelog_dispatch_event(this, priv, &ev); + } +out: + /* If this is explicit rollover initiated by snapshot, + * wakeup reconfigure thread waiting for changelog to + * rollover. This should happen even in failure cases as + * well otherwise snapshot will timeout and fail. Hence + * moved under out. + */ + if (priv->explicit_rollover) { + priv->explicit_rollover = _gf_false; + + pthread_mutex_lock(&priv->bn.bnotify_mutex); + { + if (ret) { + priv->bn.bnotify_error = _gf_true; + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BNOTIFY_INFO, + "changelog=%s", nfile, NULL); + } + priv->bn.bnotify = _gf_false; + pthread_cond_signal(&priv->bn.bnotify_cond); + } + pthread_mutex_unlock(&priv->bn.bnotify_mutex); + } + return ret; +} - return ret; +int +filter_cur_par_dirs(const struct dirent *entry) +{ + if (entry == NULL) + return 0; + + if ((strcmp(entry->d_name, ".") == 0) || (strcmp(entry->d_name, "..") == 0)) + return 0; + else + return 1; } +/* + * find_current_htime: + * It finds the latest htime file and sets the HTIME_CURRENT + * xattr. + * RETURN VALUE: + * -1 : Error + * ret: Number of directory entries; + */ + int -changelog_open (xlator_t *this, - changelog_priv_t *priv) -{ - int fd = 0; - int ret = -1; - int flags = 0; - char buffer[1024] = {0,}; - char changelog_path[PATH_MAX] = {0,}; - - (void) snprintf (changelog_path, PATH_MAX, - "%s/"CHANGELOG_FILE_NAME, - priv->changelog_dir); - - flags |= (O_CREAT | O_RDWR); - if (priv->fsync_interval == 0) - flags |= O_SYNC; - - fd = open (changelog_path, flags, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "unable to open/create changelog file %s" - " (reason: %s). 
change-logging will be" - " inactive", changelog_path, strerror (errno)); - goto out; +find_current_htime(int ht_dir_fd, const char *ht_dir_path, char *ht_file_bname) +{ + struct dirent **namelist = NULL; + int ret = 0; + int cnt = 0; + int i = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(ht_dir_path); + + cnt = scandir(ht_dir_path, &namelist, filter_cur_par_dirs, alphasort); + if (cnt < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_SCAN_DIR_FAILED, + NULL); + } else if (cnt > 0) { + if (snprintf(ht_file_bname, NAME_MAX, "%s", + namelist[cnt - 1]->d_name) >= NAME_MAX) { + ret = -1; + goto out; + } + if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname, + strlen(ht_file_bname), 0)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_FSETXATTR_FAILED, "HTIME_CURRENT", NULL); + ret = -1; + goto out; } - priv->changelog_fd = fd; - - (void) snprintf (buffer, 1024, CHANGELOG_HEADER, - CHANGELOG_VERSION_MAJOR, - CHANGELOG_VERSION_MINOR, - priv->ce->encoder); - ret = changelog_write_change (priv, buffer, strlen (buffer)); - if (ret) { - close (priv->changelog_fd); - priv->changelog_fd = -1; - goto out; + if (sys_fsync(ht_dir_fd) < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_FSYNC_OP_FAILED, NULL); + ret = -1; + goto out; } + } - ret = 0; +out: + for (i = 0; i < cnt; i++) + free(namelist[i]); + free(namelist); - out: - return ret; + if (ret) + cnt = ret; + + return cnt; } +/* Returns 0 on successful open of htime file + * returns -1 on failure or error + */ int -changelog_start_next_change (xlator_t *this, - changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale) +htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts) { - int ret = -1; + int ht_file_fd = -1; + int ht_dir_fd = -1; + int ret = 0; + int cnt = 0; + char ht_dir_path[PATH_MAX] = { + 0, + }; + char ht_file_path[PATH_MAX] = { + 0, + }; + char ht_file_bname[NAME_MAX] = { + 0, + }; + char x_value[NAME_MAX] = { + 0, + }; + 
int flags = 0; + unsigned long min_ts = 0; + unsigned long max_ts = 0; + unsigned long total = 0; + unsigned long total1 = 0; + ssize_t size = 0; + struct stat stat_buf = { + 0, + }; + unsigned long record_len = 0; + int32_t len = 0; + + CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path); + + /* Open htime directory to get HTIME_CURRENT */ + ht_dir_fd = open(ht_dir_path, O_RDONLY); + if (ht_dir_fd == -1) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", ht_dir_path, NULL); + ret = -1; + goto out; + } + + size = sys_fgetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname, + sizeof(ht_file_bname)); + if (size < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED, + "name=HTIME_CURRENT", NULL); + + /* If upgrade scenario, find the latest HTIME.TSTAMP file + * and use the same. If error, create a new HTIME.TSTAMP + * file. + */ + cnt = find_current_htime(ht_dir_fd, ht_dir_path, ht_file_bname); + if (cnt <= 0) { + gf_smsg(this->name, GF_LOG_INFO, errno, + CHANGELOG_MSG_NO_HTIME_CURRENT, NULL); + sys_close(ht_dir_fd); + return htime_create(this, priv, ts); + } - ret = changelog_rollover_changelog (this, priv, ts); + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_HTIME_CURRENT_ERROR, NULL); + } + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_CURRENT, "path=%s", + ht_file_bname, NULL); + len = snprintf(ht_file_path, PATH_MAX, "%s/%s", ht_dir_path, ht_file_bname); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + /* Open in append mode as existing htime file is used */ + flags |= (O_RDWR | O_SYNC | O_APPEND); + ht_file_fd = open(ht_file_path, flags, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (ht_file_fd < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", ht_file_path, NULL); + ret = -1; + goto out; + } + + /* save this htime_fd in priv->htime_fd */ + priv->htime_fd = ht_file_fd; + + ret = sys_fstat(ht_file_fd, &stat_buf); + if 
(ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_STAT_ERROR, + "path=%s", ht_file_path, NULL); + ret = -1; + goto out; + } + + /* Initialize rollover-number in priv to current number */ + size = sys_fgetxattr(ht_file_fd, HTIME_KEY, x_value, sizeof(x_value)); + if (size < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED, + "name=%s", HTIME_KEY, "path=%s", ht_file_path, NULL); + ret = -1; + goto out; + } + + sscanf(x_value, "%lu:%lu", &max_ts, &total); + + /* 22 = 1(/) + 20(CHANGELOG.TIMESTAMP) + 1(\x00) */ + record_len = strlen(priv->changelog_dir) + 22; + total1 = stat_buf.st_size / record_len; + if (total != total1) { + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO, + "xattr_total=%lu", total, "size_total=%lu", total1, NULL); + } + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO, "min=%lu", + min_ts, "max=%lu", max_ts, "total_changelogs=%lu", total, NULL); + + if (total < total1) + priv->rollover_count = total1 + 1; + else + priv->rollover_count = total + 1; + +out: + if (ht_dir_fd != -1) + sys_close(ht_dir_fd); + return ret; +} - if (!ret && !finale) - ret = changelog_open (this, priv); +/* Returns 0 on successful creation of htime file + * returns -1 on failure or error + */ +int +htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts) +{ + int ht_file_fd = -1; + int ht_dir_fd = -1; + int ret = 0; + char ht_dir_path[PATH_MAX] = { + 0, + }; + char ht_file_path[PATH_MAX] = { + 0, + }; + char ht_file_bname[NAME_MAX + 1] = { + 0, + }; + int flags = 0; + int32_t len = 0; + + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_NEW_HTIME_FILE, + "name=%ld", ts, NULL); + + CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path); + + /* get the htime file name in ht_file_path */ + len = snprintf(ht_file_path, PATH_MAX, "%s/%s.%ld", ht_dir_path, + HTIME_FILE_NAME, ts); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + flags |= (O_CREAT | O_RDWR | 
O_SYNC); + ht_file_fd = open(ht_file_path, flags, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (ht_file_fd < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", ht_file_path, NULL); + ret = -1; + goto out; + } + + if (sys_fsetxattr(ht_file_fd, HTIME_KEY, HTIME_INITIAL_VALUE, + sizeof(HTIME_INITIAL_VALUE) - 1, 0)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_XATTR_INIT_FAILED, NULL); + ret = -1; + goto out; + } + + ret = sys_fsync(ht_file_fd); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED, + NULL); + goto out; + } + + /* save this htime_fd in priv->htime_fd */ + priv->htime_fd = ht_file_fd; + + ht_file_fd = -1; + + /* Set xattr HTIME_CURRENT on htime directory to htime filename */ + ht_dir_fd = open(ht_dir_path, O_RDONLY); + if (ht_dir_fd == -1) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", ht_dir_path, NULL); + ret = -1; + goto out; + } + + (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%ld", + HTIME_FILE_NAME, ts); + if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname, + strlen(ht_file_bname), 0)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSETXATTR_FAILED, + " HTIME_CURRENT", NULL); + ret = -1; + goto out; + } + + ret = sys_fsync(ht_dir_fd); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED, + NULL); + goto out; + } + + /* initialize rollover-number in priv to 1 */ + priv->rollover_count = 1; + +out: + if (ht_dir_fd != -1) + sys_close(ht_dir_fd); + if (ht_file_fd != -1) + sys_close(ht_file_fd); + return ret; +} - return ret; +/* Description: + * Opens the snap changelog to log call path fops in it. + * This changelos name is "CHANGELOG.SNAP", stored in + * path ".glusterfs/changelogs/csnap". + * Returns: + * 0 : On success. + * -1 : On failure. 
+ */ +int +changelog_snap_open(xlator_t *this, changelog_priv_t *priv) +{ + int fd = -1; + int ret = 0; + int flags = 0; + char buffer[1024] = { + 0, + }; + char c_snap_path[PATH_MAX] = { + 0, + }; + char csnap_dir_path[PATH_MAX] = { + 0, + }; + int32_t len = 0; + + CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir_path); + + len = snprintf(c_snap_path, PATH_MAX, "%s/" CSNAP_FILE_NAME, + csnap_dir_path); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + + flags |= (O_CREAT | O_RDWR | O_TRUNC); + + fd = open(c_snap_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", c_snap_path, NULL); + ret = -1; + goto out; + } + priv->c_snap_fd = fd; + + (void)snprintf(buffer, 1024, CHANGELOG_HEADER, CHANGELOG_VERSION_MAJOR, + CHANGELOG_VERSION_MINOR, priv->ce->encoder); + ret = changelog_snap_write_change(priv, buffer, strlen(buffer)); + if (ret < 0) { + sys_close(priv->c_snap_fd); + priv->c_snap_fd = -1; + goto out; + } + +out: + return ret; } -/** - * return the length of entry +/* + * Description: + * Starts logging fop details in CSNAP journal. + * Returns: + * 0 : On success. + * -1 : On Failure. */ -inline size_t -changelog_entry_length () +int +changelog_snap_logging_start(xlator_t *this, changelog_priv_t *priv) { - return sizeof (changelog_log_data_t); + int ret = 0; + + ret = changelog_snap_open(this, priv); + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "starting", + NULL); + + return ret; } +/* + * Description: + * Stops logging fop details in CSNAP journal. + * Returns: + * 0 : On success. + * -1 : On Failure. 
+ */ int -changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last) +changelog_snap_logging_stop(xlator_t *this, changelog_priv_t *priv) { - struct timeval tv = {0,}; + int ret = 0; - cld->cld_type = CHANGELOG_TYPE_ROLLOVER; + sys_close(priv->c_snap_fd); + priv->c_snap_fd = -1; - if (gettimeofday (&tv, NULL)) - return -1; + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "Stopped", + NULL); - cld->cld_roll_time = (unsigned long) tv.tv_sec; - cld->cld_finale = is_last; - return 0; + return ret; } int -changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len) +changelog_open_journal(xlator_t *this, changelog_priv_t *priv) { - return changelog_write (priv->changelog_fd, buffer, len); + int fd = 0; + int ret = -1; + int flags = 0; + char buffer[1024] = { + 0, + }; + char changelog_path[PATH_MAX] = { + 0, + }; + + (void)snprintf(changelog_path, PATH_MAX, "%s/" CHANGELOG_FILE_NAME, + priv->changelog_dir); + + flags |= (O_CREAT | O_RDWR); + if (priv->fsync_interval == 0) + flags |= O_SYNC; + + fd = open(changelog_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED, + "path=%s", changelog_path, NULL); + goto out; + } + + priv->changelog_fd = fd; + + (void)snprintf(buffer, 1024, CHANGELOG_HEADER, CHANGELOG_VERSION_MAJOR, + CHANGELOG_VERSION_MINOR, priv->ce->encoder); + ret = changelog_write_change(priv, buffer, strlen(buffer)); + if (ret) { + sys_close(priv->changelog_fd); + priv->changelog_fd = -1; + goto out; + } + + ret = 0; + +out: + return ret; } -inline int -changelog_handle_change (xlator_t *this, - changelog_priv_t *priv, changelog_log_data_t *cld) +int +changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts, + gf_boolean_t finale) { - int ret = 0; + int ret = -1; - if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) { - changelog_encode_change(priv); - ret = changelog_start_next_change (this, priv, - 
cld->cld_roll_time, - cld->cld_finale); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "Problem rolling over changelog(s)"); - goto out; - } + ret = changelog_rollover_changelog(this, priv, ts); - /** - * case when there is reconfigure done (disabling changelog) and there - * are still fops that have updates in prgress. - */ - if (priv->changelog_fd == -1) - return 0; - - if (CHANGELOG_TYPE_IS_FSYNC (cld->cld_type)) { - ret = fsync (priv->changelog_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "fsync failed (reason: %s)", - strerror (errno)); - } - goto out; - } + if (!ret && !finale) + ret = changelog_open_journal(this, priv); - ret = priv->ce->encode (this, cld); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "error writing changelog to disk"); - } + return ret; +} - out: - return ret; +/** + * return the length of entry + */ +size_t +changelog_entry_length() +{ + return sizeof(changelog_log_data_t); } -changelog_local_t * -changelog_local_init (xlator_t *this, inode_t *inode, - uuid_t gfid, int xtra_records, - gf_boolean_t update_flag) +void +changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last) { - changelog_local_t *local = NULL; - struct iobuf *iobuf = NULL; + cld->cld_type = CHANGELOG_TYPE_ROLLOVER; + cld->cld_roll_time = gf_time(); + cld->cld_finale = is_last; +} - /** - * We relax the presence of inode if @update_flag is true. - * The caller (implmentation of the fop) needs to be careful to - * not blindly use local->inode. 
- */ - if (!update_flag && !inode) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "inode needed for version checking !!!"); - goto out; - } +int +changelog_snap_write_change(changelog_priv_t *priv, char *buffer, size_t len) +{ + return changelog_write(priv->c_snap_fd, buffer, len); +} - if (xtra_records) { - iobuf = iobuf_get2 (this->ctx->iobuf_pool, - xtra_records * CHANGELOG_OPT_RECORD_LEN); - if (!iobuf) - goto out; - } +int +changelog_write_change(changelog_priv_t *priv, char *buffer, size_t len) +{ + return changelog_write(priv->changelog_fd, buffer, len); +} - local = mem_get0 (this->local_pool); - if (!local) { - CHANGELOG_IOBUF_UNREF (iobuf); - goto out; - } +/* + * Descriptions: + * Writes fop details in ascii format to CSNAP. + * Issues: + * Not Encoding agnostic. + * Returns: + * 0 : On Success. + * -1 : On Failure. + */ +int +changelog_snap_handle_ascii_change(xlator_t *this, changelog_log_data_t *cld) +{ + size_t off = 0; + size_t gfid_len = 0; + char *gfid_str = NULL; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + int ret = 0; + + if (this == NULL) { + ret = -1; + goto out; + } + + priv = this->private; + + if (priv == NULL) { + ret = -1; + goto out; + } + + gfid_str = uuid_utoa(cld->cld_gfid); + gfid_len = strlen(gfid_str); + + /* extra bytes for decorations */ + buffer = alloca(gfid_len + cld->cld_ptr_len + 10); + CHANGELOG_STORE_ASCII(priv, buffer, off, gfid_str, gfid_len, cld); + + CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1); + + ret = changelog_snap_write_change(priv, buffer, off); + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED, + "csnap", NULL); + } + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_WROTE_TO_CSNAP, NULL); + ret = 0; +out: + return ret; +} - local->update_no_check = update_flag; +int +changelog_handle_change(xlator_t *this, changelog_priv_t *priv, + changelog_log_data_t *cld) +{ + int ret = 0; + + if (CHANGELOG_TYPE_IS_ROLLOVER(cld->cld_type)) { + 
changelog_encode_change(priv); + ret = changelog_start_next_change(this, priv, cld->cld_roll_time, + cld->cld_finale); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_GET_TIME_OP_FAILED, NULL); + goto out; + } + + /** + * case when there is reconfigure done (disabling changelog) and there + * are still fops that have updates in prgress. + */ + if (priv->changelog_fd == -1) + return 0; - uuid_copy (local->cld.cld_gfid, gfid); + if (CHANGELOG_TYPE_IS_FSYNC(cld->cld_type)) { + ret = sys_fsync(priv->changelog_fd); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_FSYNC_OP_FAILED, NULL); + } + goto out; + } - local->cld.cld_iobuf = iobuf; - local->cld.cld_xtra_records = 0; /* set by the caller */ + ret = priv->ce->encode(this, cld); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED, + "changelog", NULL); + } - if (inode) - local->inode = inode_ref (inode); +out: + return ret; +} - out: - return local; +changelog_local_t * +changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid, + int xtra_records, gf_boolean_t update_flag) +{ + changelog_local_t *local = NULL; + struct iobuf *iobuf = NULL; + + /** + * We relax the presence of inode if @update_flag is true. + * The caller (implementation of the fop) needs to be careful to + * not blindly use local->inode. 
+ */ + if (!update_flag && !inode) { + gf_msg_callingfn(this->name, GF_LOG_WARNING, 0, + CHANGELOG_MSG_INODE_NOT_FOUND, + "inode needed for version checking !!!"); + + goto out; + } + + if (xtra_records) { + iobuf = iobuf_get2(this->ctx->iobuf_pool, + xtra_records * CHANGELOG_OPT_RECORD_LEN); + if (!iobuf) + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + CHANGELOG_IOBUF_UNREF(iobuf); + goto out; + } + + local->update_no_check = update_flag; + + gf_uuid_copy(local->cld.cld_gfid, gfid); + + local->cld.cld_iobuf = iobuf; + local->cld.cld_xtra_records = 0; /* set by the caller */ + + if (inode) + local->inode = inode_ref(inode); + +out: + return local; } int -changelog_forget (xlator_t *this, inode_t *inode) +changelog_forget(xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - changelog_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; - inode_ctx_del (inode, this, &ctx_addr); - if (!ctx_addr) - return 0; + inode_ctx_del(inode, this, &ctx_addr); + if (!ctx_addr) + return 0; - ctx = (changelog_inode_ctx_t *) (long) ctx_addr; - GF_FREE (ctx); + ctx = (changelog_inode_ctx_t *)(long)ctx_addr; + GF_FREE(ctx); - return 0; + return 0; } int -changelog_inject_single_event (xlator_t *this, - changelog_priv_t *priv, - changelog_log_data_t *cld) +changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv, + changelog_log_data_t *cld) { - return priv->cd.dispatchfn (this, priv, priv->cd.cd_data, cld, NULL); + return priv->cd.dispatchfn(this, priv, priv->cd.cd_data, cld, NULL); +} + +/* Wait till all the black fops are drained */ +void +changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + /* clean up framework of pthread_mutex is required here as + * 'reconfigure' terminates the changelog_rollover thread + * on graph change. 
+ */ + pthread_cleanup_push(changelog_cleanup_free_mutex, + &priv->dm.drain_black_mutex); + ret = pthread_mutex_lock(&priv->dm.drain_black_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR, + "error=%d", ret, NULL); + while (priv->dm.black_fop_cnt > 0) { + gf_msg_debug(this->name, 0, "Conditional wait on black fops: %ld", + priv->dm.black_fop_cnt); + priv->dm.drain_wait_black = _gf_true; + ret = pthread_cond_wait(&priv->dm.drain_black_cond, + &priv->dm.drain_black_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret, + NULL); + } + priv->dm.drain_wait_black = _gf_false; + ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR, + "error=%d", ret, NULL); + pthread_cleanup_pop(0); + gf_msg_debug(this->name, 0, "Woke up: Conditional wait on black fops"); +} + +/* Wait till all the white fops are drained */ +void +changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + /* clean up framework of pthread_mutex is required here as + * 'reconfigure' terminates the changelog_rollover thread + * on graph change. 
+ */ + pthread_cleanup_push(changelog_cleanup_free_mutex, + &priv->dm.drain_white_mutex); + ret = pthread_mutex_lock(&priv->dm.drain_white_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR, + "error=%d", ret, NULL); + while (priv->dm.white_fop_cnt > 0) { + gf_msg_debug(this->name, 0, "Conditional wait on white fops : %ld", + priv->dm.white_fop_cnt); + priv->dm.drain_wait_white = _gf_true; + ret = pthread_cond_wait(&priv->dm.drain_white_cond, + &priv->dm.drain_white_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret, + NULL); + } + priv->dm.drain_wait_white = _gf_false; + ret = pthread_mutex_unlock(&priv->dm.drain_white_mutex); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR, + "error=%d", ret, NULL); + pthread_cleanup_pop(0); + gf_msg_debug(this->name, 0, "Woke up: Conditional wait on white fops"); } /** @@ -380,160 +1242,248 @@ changelog_inject_single_event (xlator_t *this, * a certain time etc..). move them into separate routine. 
*/ void * -changelog_rollover (void *data) -{ - int ret = 0; - xlator_t *this = NULL; - struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; - changelog_time_slice_t *slice = NULL; - changelog_priv_t *priv = data; - - this = priv->cr.this; - slice = &priv->slice; - - while (1) { - tv.tv_sec = priv->rollover_time; - tv.tv_usec = 0; - - ret = select (0, NULL, NULL, NULL, &tv); - if (ret) - continue; - - ret = changelog_fill_rollover_data (&cld, _gf_false); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to fill rollover data"); - continue; - } +changelog_rollover(void *data) +{ + int ret = 0; + xlator_t *this = NULL; + struct timespec tv = { + 0, + }; + changelog_log_data_t cld = { + 0, + }; + changelog_time_slice_t *slice = NULL; + changelog_priv_t *priv = data; + + this = priv->cr.this; + slice = &priv->slice; + + while (1) { + (void)pthread_testcancel(); + + tv.tv_sec = gf_time() + priv->rollover_time; + tv.tv_nsec = 0; + ret = 0; /* Reset ret to zero */ + + /* The race between actual rollover and explicit rollover is + * handled. If actual rollover is being done and the + * explicit rollover event comes, the event is not missed. + * Since explicit rollover sets 'cr.notify' to true, this + * thread doesn't wait on 'pthread_cond_timedwait'. 
+ */ + pthread_cleanup_push(changelog_cleanup_free_mutex, &priv->cr.lock); + pthread_mutex_lock(&priv->cr.lock); + { + while (ret == 0 && !priv->cr.notify) + ret = pthread_cond_timedwait(&priv->cr.cond, &priv->cr.lock, + &tv); + if (ret == 0) + priv->cr.notify = _gf_false; + } + pthread_mutex_unlock(&priv->cr.lock); + pthread_cleanup_pop(0); + + if (ret == 0) { + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO, + NULL); + priv->explicit_rollover = _gf_true; + } else if (ret && ret != ETIMEDOUT) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_SELECT_FAILED, NULL); + continue; + } else if (ret && ret == ETIMEDOUT) { + gf_msg_debug(this->name, 0, "Wokeup on timeout"); + } - LOCK (&priv->lock); - { - ret = changelog_inject_single_event (this, priv, &cld); - if (!ret) - SLICE_VERSION_UPDATE (slice); - } - UNLOCK (&priv->lock); + /* Reading curent_color without lock is fine here + * as it is only modified here and is next to reading. + */ + if (priv->current_color == FOP_COLOR_BLACK) { + LOCK(&priv->lock); + priv->current_color = FOP_COLOR_WHITE; + UNLOCK(&priv->lock); + gf_msg_debug(this->name, 0, + "Black fops" + " to be drained:%ld", + priv->dm.black_fop_cnt); + changelog_drain_black_fops(this, priv); + } else { + LOCK(&priv->lock); + priv->current_color = FOP_COLOR_BLACK; + UNLOCK(&priv->lock); + gf_msg_debug(this->name, 0, + "White fops" + " to be drained:%ld", + priv->dm.white_fop_cnt); + changelog_drain_white_fops(this, priv); } - return NULL; + /* Adding delay of 1 second only during explicit rollover: + * + * Changelog rollover can happen either due to actual + * or the explicit rollover during snapshot. Actual + * rollover is controlled by tuneable called 'rollover-time'. + * The minimum granularity for rollover-time is 1 second. + * Explicit rollover is asynchronous in nature and happens + * during snapshot. + * + * Basically, rollover renames the current CHANGELOG file + * to CHANGELOG.TIMESTAMP. 
Let's assume, at time 't1', + * actual and explicit rollover raced against each + * other and actual rollover won the race renaming the + * CHANGELOG file to CHANGELOG.t1 and opens a new + * CHANGELOG file. There is high chance that, an immediate + * explicit rollover at time 't1' can happen with in the same + * second to rename CHANGELOG file to CHANGELOG.t1 resulting in + * purging the earlier CHANGELOG.t1 file created by actual + * rollover. So adding a delay of 1 second guarantees unique + * CHANGELOG.TIMESTAMP during explicit rollover. + */ + if (priv->explicit_rollover == _gf_true) + sleep(1); + + changelog_fill_rollover_data(&cld, _gf_false); + + _mask_cancellation(); + + LOCK(&priv->lock); + { + ret = changelog_inject_single_event(this, priv, &cld); + if (!ret) + SLICE_VERSION_UPDATE(slice); + } + UNLOCK(&priv->lock); + + _unmask_cancellation(); + } + + return NULL; } void * -changelog_fsync_thread (void *data) +changelog_fsync_thread(void *data) { - int ret = 0; - xlator_t *this = NULL; - struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; - changelog_priv_t *priv = data; + int ret = 0; + xlator_t *this = NULL; + struct timeval tv = { + 0, + }; + changelog_log_data_t cld = { + 0, + }; + changelog_priv_t *priv = data; - this = priv->cf.this; - cld.cld_type = CHANGELOG_TYPE_FSYNC; + this = priv->cf.this; + cld.cld_type = CHANGELOG_TYPE_FSYNC; - while (1) { - tv.tv_sec = priv->fsync_interval; - tv.tv_usec = 0; + while (1) { + (void)pthread_testcancel(); - ret = select (0, NULL, NULL, NULL, &tv); - if (ret) - continue; + tv.tv_sec = priv->fsync_interval; + tv.tv_usec = 0; - ret = changelog_inject_single_event (this, priv, &cld); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "failed to inject fsync event"); - } + ret = select(0, NULL, NULL, NULL, &tv); + if (ret) + continue; - return NULL; + _mask_cancellation(); + + ret = changelog_inject_single_event(this, priv, &cld); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + 
CHANGELOG_MSG_INJECT_FSYNC_FAILED, NULL); + + _unmask_cancellation(); + } + + return NULL; } /* macros for inode/changelog version checks */ -#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) do { \ - LOCK (&inode->lock); \ - { \ - LOCK (&priv->lock); \ - { \ - *iver = slice->changelog_version[type]; \ - } \ - UNLOCK (&priv->lock); \ - } \ - UNLOCK (&inode->lock); \ - } while (0) - -#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) do { \ - LOCK (&priv->lock); \ - { \ - upd = (ver == slice->changelog_version[type]) \ - ? _gf_false : _gf_true; \ - } \ - UNLOCK (&priv->lock); \ - } while (0) +#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) \ + do { \ + LOCK(&inode->lock); \ + { \ + LOCK(&priv->lock); \ + { \ + *iver = slice->changelog_version[type]; \ + } \ + UNLOCK(&priv->lock); \ + } \ + UNLOCK(&inode->lock); \ + } while (0) + +#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) \ + do { \ + LOCK(&priv->lock); \ + { \ + upd = (ver == slice->changelog_version[type]) ? _gf_false \ + : _gf_true; \ + } \ + UNLOCK(&priv->lock); \ + } while (0) static int -__changelog_inode_ctx_set (xlator_t *this, - inode_t *inode, changelog_inode_ctx_t *ctx) +__changelog_inode_ctx_set(xlator_t *this, inode_t *inode, + changelog_inode_ctx_t *ctx) { - uint64_t ctx_addr = (uint64_t) ctx; - return __inode_ctx_set (inode, this, &ctx_addr); + uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx; + return __inode_ctx_set(inode, this, &ctx_addr); } /** * one shot routine to get the address and the value of a inode version * for a particular type. 
*/ -static changelog_inode_ctx_t * -__changelog_inode_ctx_get (xlator_t *this, - inode_t *inode, unsigned long **iver, - unsigned long *version, changelog_log_type type) -{ - int ret = 0; - uint64_t ctx_addr = 0; - changelog_inode_ctx_t *ctx = NULL; - - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (changelog_inode_ctx_t *) (long)ctx_addr; - goto out; - } - - ctx = GF_CALLOC (1, sizeof (*ctx), gf_changelog_mt_inode_ctx_t); - if (!ctx) - goto out; - - ret = __changelog_inode_ctx_set (this, inode, ctx); - if (ret) { - GF_FREE (ctx); - ctx = NULL; - } - - out: - if (ctx && iver && version) { - *iver = CHANGELOG_INODE_VERSION_TYPE (ctx, type); - *version = **iver; - } - - return ctx; +changelog_inode_ctx_t * +__changelog_inode_ctx_get(xlator_t *this, inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) +{ + int ret = 0; + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (changelog_inode_ctx_t *)(long)ctx_addr; + goto out; + } + + ctx = GF_CALLOC(1, sizeof(*ctx), gf_changelog_mt_inode_ctx_t); + if (!ctx) + goto out; + + ret = __changelog_inode_ctx_set(this, inode, ctx); + if (ret) { + GF_FREE(ctx); + ctx = NULL; + } + +out: + if (ctx && iver && version) { + *iver = CHANGELOG_INODE_VERSION_TYPE(ctx, type); + *version = **iver; + } + + return ctx; } static changelog_inode_ctx_t * -changelog_inode_ctx_get (xlator_t *this, - inode_t *inode, unsigned long **iver, - unsigned long *version, changelog_log_type type) +changelog_inode_ctx_get(xlator_t *this, inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) { - changelog_inode_ctx_t *ctx = NULL; + changelog_inode_ctx_t *ctx = NULL; - LOCK (&inode->lock); - { - ctx = __changelog_inode_ctx_get (this, - inode, iver, version, type); - } - UNLOCK (&inode->lock); + 
LOCK(&inode->lock); + { + ctx = __changelog_inode_ctx_get(this, inode, iver, version, type); + } + UNLOCK(&inode->lock); - return ctx; + return ctx; } /** @@ -636,58 +1586,392 @@ changelog_inode_ctx_get (xlator_t *this, * then there is no need to record an update (as the equality of the two version * signifies an update was recorded in the current time slice). */ -inline void -changelog_update (xlator_t *this, changelog_priv_t *priv, - changelog_local_t *local, changelog_log_type type) -{ - int ret = 0; - unsigned long *iver = NULL; - unsigned long version = 0; - inode_t *inode = NULL; - changelog_time_slice_t *slice = NULL; - changelog_inode_ctx_t *ctx = NULL; - changelog_log_data_t *cld_0 = NULL; - changelog_log_data_t *cld_1 = NULL; - changelog_local_t *next_local = NULL; - gf_boolean_t need_upd = _gf_true; - - slice = &priv->slice; +void +changelog_update(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type) +{ + int ret = 0; + unsigned long *iver = NULL; + unsigned long version = 0; + inode_t *inode = NULL; + changelog_time_slice_t *slice = NULL; + changelog_inode_ctx_t *ctx = NULL; + changelog_log_data_t *cld_0 = NULL; + changelog_log_data_t *cld_1 = NULL; + changelog_local_t *next_local = NULL; + gf_boolean_t need_upd = _gf_true; + + slice = &priv->slice; + + /** + * for fops that do not require inode version checking + */ + if (local->update_no_check) + goto update; + + inode = local->inode; + + ctx = changelog_inode_ctx_get(this, inode, &iver, &version, type); + if (!ctx) + goto update; + + INODE_VERSION_EQUALS_SLICE(priv, version, slice, type, need_upd); + +update: + if (need_upd) { + cld_0 = &local->cld; + cld_0->cld_type = type; + + if ((next_local = local->prev_entry) != NULL) { + cld_1 = &next_local->cld; + cld_1->cld_type = type; + } + + ret = priv->cd.dispatchfn(this, priv, priv->cd.cd_data, cld_0, cld_1); /** - * for fops that do not require inode version checking + * update after the dispatcher has 
successfully done + * it's job. */ - if (local->update_no_check) - goto update; + if (!local->update_no_check && iver && !ret) + INODE_VERSION_UPDATE(priv, inode, iver, slice, type); + } - inode = local->inode; + return; +} - ctx = changelog_inode_ctx_get (this, - inode, &iver, &version, type); - if (!ctx) - goto update; +/* Begin: Geo-rep snapshot dependency changes */ - INODE_VERSION_EQUALS_SLICE (priv, version, slice, type, need_upd); +/* changelog_color_fop_and_inc_cnt: Assign color and inc fop cnt. + * + * Assigning color and increment of corresponding fop count should happen + * in a lock (i.e., there should be no window between them). If it does not, + * we might miss draining those fops which are colored but not yet incremented + * the count. Let's assume black fops are draining. If the black fop count + * reaches zero, we say draining is completed but we miss black fops which are + * not incremented fop count but color is assigned black. + */ - update: - if (need_upd) { - cld_0 = &local->cld; - cld_0->cld_type = type; +void +changelog_color_fop_and_inc_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local) +{ + if (!priv || !local) + return; + + LOCK(&priv->lock); + { + local->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, local); + } + UNLOCK(&priv->lock); +} + +/* Increments the respective fop counter based on the fop color */ +void +changelog_inc_fop_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local) +{ + int ret = 0; + + if (local) { + if (local->color == FOP_COLOR_BLACK) { + ret = pthread_mutex_lock(&priv->dm.drain_black_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + { + priv->dm.black_fop_cnt++; + } + ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + } else { + ret = pthread_mutex_lock(&priv->dm.drain_white_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + { + priv->dm.white_fop_cnt++; + } + ret = 
pthread_mutex_unlock(&priv->dm.drain_white_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + } + } +out: + return; +} - if ( (next_local = local->prev_entry) != NULL ) { - cld_1 = &next_local->cld; - cld_1->cld_type = type; +/* Decrements the respective fop counter based on the fop color */ +void +changelog_dec_fop_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local) +{ + int ret = 0; + + if (local) { + if (local->color == FOP_COLOR_BLACK) { + ret = pthread_mutex_lock(&priv->dm.drain_black_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + { + priv->dm.black_fop_cnt--; + if (priv->dm.black_fop_cnt == 0 && + priv->dm.drain_wait_black == _gf_true) { + ret = pthread_cond_signal(&priv->dm.drain_black_cond); + CHANGELOG_PTHREAD_ERROR_HANDLE_2( + ret, out, priv->dm.drain_black_mutex); + gf_msg_debug(this->name, 0, + "Signalled " + "draining of black"); + } + } + ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + } else { + ret = pthread_mutex_lock(&priv->dm.drain_white_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + { + priv->dm.white_fop_cnt--; + if (priv->dm.white_fop_cnt == 0 && + priv->dm.drain_wait_white == _gf_true) { + ret = pthread_cond_signal(&priv->dm.drain_white_cond); + CHANGELOG_PTHREAD_ERROR_HANDLE_2( + ret, out, priv->dm.drain_white_mutex); + gf_msg_debug(this->name, 0, + "Signalled " + "draining of white"); } + } + ret = pthread_mutex_unlock(&priv->dm.drain_white_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + } + } +out: + return; +} - ret = priv->cd.dispatchfn (this, priv, - priv->cd.cd_data, cld_0, cld_1); +/* Write to a pipe setup between changelog main thread and changelog + * rollover thread to initiate explicit rollover of changelog journal. 
+ */ +int +changelog_barrier_notify(changelog_priv_t *priv, char *buf) +{ + int ret = 0; + + pthread_mutex_lock(&priv->cr.lock); + { + ret = pthread_cond_signal(&priv->cr.cond); + priv->cr.notify = _gf_true; + } + pthread_mutex_unlock(&priv->cr.lock); + return ret; +} - /** - * update after the dispatcher has successfully done - * it's job. - */ - if (!local->update_no_check && iver && !ret) - INODE_VERSION_UPDATE (priv, inode, iver, slice, type); +/* Clean up flags set on barrier notification */ +void +changelog_barrier_cleanup(xlator_t *this, changelog_priv_t *priv, + struct list_head *queue) +{ + int ret = 0; + + LOCK(&priv->bflags.lock); + priv->bflags.barrier_ext = _gf_false; + UNLOCK(&priv->bflags.lock); + + ret = pthread_mutex_lock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + { + priv->bn.bnotify = _gf_false; + } + ret = pthread_mutex_unlock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out); + + /* Disable changelog barrier and dequeue fops */ + LOCK(&priv->lock); + { + if (priv->barrier_enabled == _gf_true) + __chlog_barrier_disable(this, queue); + else + ret = -1; + } + UNLOCK(&priv->lock); + if (ret == 0) + chlog_barrier_dequeue_all(this, queue); + +out: + return; +} +/* End: Geo-Rep snapshot dependency changes */ + +int32_t +changelog_fill_entry_buf(call_frame_t *frame, xlator_t *this, loc_t *loc, + changelog_local_t **local) +{ + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + char *dup_path = NULL; + char *bname = NULL; + inode_t *parent = NULL; + + GF_ASSERT(this); + + parent = inode_parent(loc->inode, 0, 0); + if (!parent) { + gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_INODE_NOT_FOUND, + "type=parent", "gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto err; + } + + CHANGELOG_INIT_NOCHECK(this, *local, loc->inode, loc->inode->gfid, 5); + if (!(*local)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_LOCAL_INIT_FAILED, + NULL); + goto err; + } + + co = 
changelog_get_usable_buffer(*local); + if (!co) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_GET_BUFFER_FAILED, + NULL); + goto err; + } + + if (loc->inode->ia_type == IA_IFDIR) { + CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_MKDIR, fop_fn, xtra_len); + co++; + CHANGELOG_FILL_UINT32(co, S_IFDIR | 0755, number_fn, xtra_len); + co++; + } else { + CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_CREATE, fop_fn, xtra_len); + co++; + CHANGELOG_FILL_UINT32(co, S_IFREG | 0644, number_fn, xtra_len); + co++; + } + + CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len); + co++; + + dup_path = gf_strdup(loc->path); + bname = basename(dup_path); + + CHANGELOG_FILL_ENTRY(co, parent->gfid, bname, entry_fn, entry_free_fn, + xtra_len, err); + changelog_set_usable_record_and_length(*local, xtra_len, 5); + + if (dup_path) + GF_FREE(dup_path); + if (parent) + inode_unref(parent); + return 0; + +err: + if (dup_path) + GF_FREE(dup_path); + if (parent) + inode_unref(parent); + return -1; +} + +/* + * resolve_pargfid_to_path: + * It converts given pargfid to path by doing recursive readlinks at the + * backend. If bname is given, it suffixes bname to pargfid to form the + * complete path else it doesn't. It allocates memory for the path and is + * caller's responsibility to free the same. If bname is NULL and pargfid + * is ROOT, then it returns "." 
+ */ + +int +resolve_pargfid_to_path(xlator_t *this, const uuid_t pgfid, char **path, + char *bname) +{ + char *linkname = NULL; + char *dir_handle = NULL; + char *pgfidstr = NULL; + char *saveptr = NULL; + ssize_t len = 0; + int ret = 0; + uuid_t tmp_gfid = { + 0, + }; + uuid_t pargfid = { + 0, + }; + changelog_priv_t *priv = NULL; + char gpath[PATH_MAX] = { + 0, + }; + char result[PATH_MAX] = { + 0, + }; + char *dir_name = NULL; + char pre_dir_name[PATH_MAX] = { + 0, + }; + + GF_ASSERT(this); + priv = this->private; + GF_ASSERT(priv); + + gf_uuid_copy(pargfid, pgfid); + if (!path || gf_uuid_is_null(pargfid)) { + ret = -1; + goto out; + } + + if (__is_root_gfid(pargfid)) { + if (bname) + *path = gf_strdup(bname); + else + *path = gf_strdup("."); + return ret; + } + + dir_handle = alloca(PATH_MAX); + linkname = alloca(PATH_MAX); + (void)snprintf(gpath, PATH_MAX, "%s/.glusterfs/", priv->changelog_brick); + + while (!(__is_root_gfid(pargfid))) { + len = snprintf(dir_handle, PATH_MAX, "%s/%02x/%02x/%s", gpath, + pargfid[0], pargfid[1], uuid_utoa(pargfid)); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; } - return; + len = sys_readlink(dir_handle, linkname, PATH_MAX); + if (len < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_READLINK_OP_FAILED, + "could not read the " + "link from the gfid handle", + "handle=%s", dir_handle, NULL); + ret = -1; + goto out; + } + + linkname[len] = '\0'; + + pgfidstr = strtok_r(linkname + strlen("../../00/00/"), "/", &saveptr); + dir_name = strtok_r(NULL, "/", &saveptr); + + len = snprintf(result, PATH_MAX, "%s/%s", dir_name, pre_dir_name); + if ((len < 0) || (len >= PATH_MAX)) { + ret = -1; + goto out; + } + if (snprintf(pre_dir_name, len + 1, "%s", result) >= len + 1) { + ret = -1; + goto out; + } + + gf_uuid_parse(pgfidstr, tmp_gfid); + gf_uuid_copy(pargfid, tmp_gfid); + } + + if (bname) + strncat(result, bname, strlen(bname) + 1); + + *path = gf_strdup(result); + +out: + return ret; } diff --git 
a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h index ad79636b0eb..38fa7590c32 100644 --- a/xlators/features/changelog/src/changelog-helpers.h +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -11,55 +11,63 @@ #ifndef _CHANGELOG_HELPERS_H #define _CHANGELOG_HELPERS_H -#include "locking.h" -#include "timer.h" +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> #include "pthread.h" -#include "iobuf.h" +#include <glusterfs/iobuf.h> +#include <glusterfs/rot-buffs.h> #include "changelog-misc.h" +#include <glusterfs/call-stub.h> + +#include "rpcsvc.h" +#include "changelog-ev-handle.h" + +#include "changelog.h" +#include "changelog-messages.h" /** * the changelog entry */ typedef struct changelog_log_data { - /* rollover related */ - unsigned long cld_roll_time; + /* rollover related */ + time_t cld_roll_time; - /* reopen changelog? */ - gf_boolean_t cld_finale; + /* reopen changelog? */ + gf_boolean_t cld_finale; - changelog_log_type cld_type; + changelog_log_type cld_type; - /** - * sincd gfid is _always_ a necessity, it's not a part - * of the iobuf. by doing this we do not add any overhead - * for data and metadata related fops. - */ - uuid_t cld_gfid; + /** + * sincd gfid is _always_ a necessity, it's not a part + * of the iobuf. by doing this we do not add any overhead + * for data and metadata related fops. + */ + uuid_t cld_gfid; - /** - * iobufs are used for optionals records: pargfid, path, - * write offsets etc.. It's the fop implementers job - * to allocate (iobuf_get() in the fop) and get unref'ed - * in the callback (CHANGELOG_STACK_UNWIND). - */ - struct iobuf *cld_iobuf; + /** + * iobufs are used for optionals records: pargfid, path, + * write offsets etc.. It's the fop implementers job + * to allocate (iobuf_get() in the fop) and get unref'ed + * in the callback (CHANGELOG_STACK_UNWIND). 
+ */ + struct iobuf *cld_iobuf; #define cld_ptr cld_iobuf->ptr - /** - * after allocation you can point this to the length of - * usable data, but make sure it does not exceed the - * the size of the requested iobuf. - */ - size_t cld_iobuf_len; + /** + * after allocation you can point this to the length of + * usable data, but make sure it does not exceed the + * the size of the requested iobuf. + */ + size_t cld_iobuf_len; #define cld_ptr_len cld_iobuf_len - /** - * number of optional records - */ - int cld_xtra_records; + /** + * number of optional records + */ + int cld_xtra_records; } changelog_log_data_t; /** @@ -69,153 +77,280 @@ typedef struct changelog_log_data { typedef struct changelog_priv changelog_priv_t; typedef struct changelog_dispatcher { - void *cd_data; - int (*dispatchfn) (xlator_t *, changelog_priv_t *, void *, - changelog_log_data_t *, changelog_log_data_t *); + void *cd_data; + int (*dispatchfn)(xlator_t *, changelog_priv_t *, void *, + changelog_log_data_t *, changelog_log_data_t *); } changelog_dispatcher_t; struct changelog_bootstrap { - changelog_mode_t mode; - int (*ctor) (xlator_t *, changelog_dispatcher_t *); - int (*dtor) (xlator_t *, changelog_dispatcher_t *); + changelog_mode_t mode; + int (*ctor)(xlator_t *, changelog_dispatcher_t *); + int (*dtor)(xlator_t *, changelog_dispatcher_t *); }; struct changelog_encoder { - changelog_encoder_t encoder; - int (*encode) (xlator_t *, changelog_log_data_t *); + changelog_encoder_t encoder; + int (*encode)(xlator_t *, changelog_log_data_t *); }; - /* xlator private */ typedef struct changelog_time_slice { - /** - * just in case we need nanosecond granularity some day. - * field is unused as of now (maybe we'd need it later). - */ - struct timeval tv_start; - - /** - * version of changelog file, incremented each time changes - * rollover. - */ - unsigned long changelog_version[CHANGELOG_MAX_TYPE]; + /** + * version of changelog file, incremented each time changes + * rollover. 
+ */ + unsigned long changelog_version[CHANGELOG_MAX_TYPE]; } changelog_time_slice_t; typedef struct changelog_rollover { - /* rollover thread */ - pthread_t rollover_th; + /* rollover thread */ + pthread_t rollover_th; - xlator_t *this; + xlator_t *this; + + pthread_mutex_t lock; + pthread_cond_t cond; + gf_boolean_t notify; } changelog_rollover_t; typedef struct changelog_fsync { - /* fsync() thread */ - pthread_t fsync_th; + /* fsync() thread */ + pthread_t fsync_th; - xlator_t *this; + xlator_t *this; } changelog_fsync_t; -# define CHANGELOG_MAX_CLIENTS 5 -typedef struct changelog_notify { - /* reader end of the pipe */ - int rfd; +/* Draining during changelog rollover (for geo-rep snapshot dependency): + * -------------------------------------------------------------------- + * The introduction of draining of in-transit fops during changelog rollover + * (both explicit/timeout triggered) requires coloring of fops. Basically the + * implementation requires two counters, one counter which keeps the count of + * current intransit fops which should end up in current changelog and the other + * counter to keep track of incoming fops which should be drained as part of + * next changelog rollover event. The fops are colored w.r.t these counters. + * The fops that are to be drained as part of current changelog rollover is + * given one color and the fops which keep incoming during this and not + * necessarily should end up in current changelog and should be drained as part + * of next changelog rollover are given other color. The color switching + * continues with each changelog rollover. Two colors(black and white) are + * chosen here and initially black is chosen is default. 
+ */ + +typedef enum chlog_fop_color { + FOP_COLOR_BLACK, + FOP_COLOR_WHITE +} chlog_fop_color_t; + +/* Barrier notify variable */ +typedef struct barrier_notify { + pthread_mutex_t bnotify_mutex; + pthread_cond_t bnotify_cond; + gf_boolean_t bnotify; + gf_boolean_t bnotify_error; +} barrier_notify_t; + +/* Two separate mutex and conditional variable set is used + * to drain white and black fops. */ + +typedef struct drain_mgmt { + pthread_mutex_t drain_black_mutex; + pthread_cond_t drain_black_cond; + pthread_mutex_t drain_white_mutex; + pthread_cond_t drain_white_cond; + /* Represents black fops count in-transit */ + unsigned long black_fop_cnt; + /* Represents white fops count in-transit */ + unsigned long white_fop_cnt; + gf_boolean_t drain_wait_black; + gf_boolean_t drain_wait_white; +} drain_mgmt_t; + +/* External barrier as a result of snap on/off indicating flag*/ +typedef struct barrier_flags { + gf_lock_t lock; + gf_boolean_t barrier_ext; +} barrier_flags_t; + +/* Event selection */ +typedef struct changelog_ev_selector { + gf_lock_t reflock; + + /** + * Array of references for each selection bit. + */ + unsigned int ref[CHANGELOG_EV_SELECTION_RANGE]; +} changelog_ev_selector_t; + +/* changelog's private structure */ +struct changelog_priv { + /* changelog journalling */ + gf_boolean_t active; - /* notifier thread */ - pthread_t notify_th; + /* changelog live notifications */ + gf_boolean_t rpc_active; - /* unique socket path */ - char sockpath[PATH_MAX]; + /* to generate unique socket file per brick */ + char *changelog_brick; - int socket_fd; + /* logging directory */ + char *changelog_dir; - /** - * simple array of accept()'ed fds. Not scalable at all - * for large number of clients, but it's okay as we have - * a ahrd limit in this version (@CHANGELOG_MAX_CLIENTS). 
- */ - int client_fd[CHANGELOG_MAX_CLIENTS]; + /* htime directory */ + char *htime_dir; - xlator_t *this; -} changelog_notify_t; + /* one file for all changelog types */ + int changelog_fd; -struct changelog_priv { - gf_boolean_t active; + /* htime fd for current changelog session */ + int htime_fd; + + /* c_snap_fd is fd for call-path changelog */ + int c_snap_fd; + + /* rollover_count used by htime */ + int rollover_count; + + gf_lock_t lock; + + /* lock to synchronize CSNAP updation */ + gf_lock_t c_snap_lock; + + /* written end of the pipe */ + int wfd; + + /* rollover time */ + int32_t rollover_time; + + /* fsync() interval */ + int32_t fsync_interval; + + /* changelog type maps */ + const char *maps[CHANGELOG_MAX_TYPE]; + + /* time slicer */ + changelog_time_slice_t slice; + + /* context of the updater */ + changelog_dispatcher_t cd; + + /* context of the rollover thread */ + changelog_rollover_t cr; - /* to generate unique socket file per brick */ - char *changelog_brick; + /* context of fsync thread */ + changelog_fsync_t cf; - /* logging directory */ - char *changelog_dir; + /* operation mode */ + changelog_mode_t op_mode; - /* one file for all changelog types */ - int changelog_fd; + /* bootstrap routine for 'current' logger */ + struct changelog_bootstrap *cb; - gf_lock_t lock; + /* encoder mode */ + changelog_encoder_t encode_mode; - /* writen end of the pipe */ - int wfd; + /* encoder */ + struct changelog_encoder *ce; - /* rollover time */ - int32_t rollover_time; + /** + * snapshot dependency changes + */ - /* fsync() interval */ - int32_t fsync_interval; + /* Draining of fops*/ + drain_mgmt_t dm; - /* changelog type maps */ - const char *maps[CHANGELOG_MAX_TYPE]; + /* Represents the active color. 
Initially by default black */ + chlog_fop_color_t current_color; - /* time slicer */ - changelog_time_slice_t slice; + /* flag to determine explicit rollover is triggered */ + gf_boolean_t explicit_rollover; - /* context of the updater */ - changelog_dispatcher_t cd; + /* barrier notification variable protected by mutex */ + barrier_notify_t bn; - /* context of the rollover thread */ - changelog_rollover_t cr; + /* barrier on/off indicating flags */ + barrier_flags_t bflags; - /* context of fsync thread */ - changelog_fsync_t cf; + /* changelog barrier on/off indicating flag */ + gf_boolean_t barrier_enabled; + struct list_head queue; + uint32_t queue_size; + gf_timer_t *timer; + struct timespec timeout; - /* context of the notifier thread */ - changelog_notify_t cn; + /** + * buffers, RPC, event selection, notifications and other + * beasts. + */ - /* operation mode */ - changelog_mode_t op_mode; + /* epoll pthread */ + pthread_t poller; - /* bootstrap routine for 'current' logger */ - struct changelog_bootstrap *cb; + /* rotational buffer */ + rbuf_t *rbuf; - /* encoder mode */ - changelog_encoder_t encode_mode; + /* changelog RPC server */ + rpcsvc_t *rpc; - /* encoder */ - struct changelog_encoder *ce; + /* event selection */ + changelog_ev_selector_t ev_selection; + + /* client handling (reverse connection) */ + pthread_t connector; + + int nr_dispatchers; + pthread_t *ev_dispatcher; + + changelog_clnt_t connections; + + /* glusterfind dependency to capture paths on deleted entries*/ + gf_boolean_t capture_del_path; + + /* Save total no. of listners */ + gf_atomic_t listnercnt; + + /* Save total no. of xprt are associated with listner */ + gf_atomic_t xprtcnt; + + /* Save xprt list */ + struct list_head xprt_list; + + /* Save total no. 
of client connection */ + gf_atomic_t clntcnt; + + /* Save cleanup brick in victim */ + xlator_t *victim; + + /* Status to save cleanup notify status */ + gf_boolean_t notify_down; }; struct changelog_local { - inode_t *inode; - gf_boolean_t update_no_check; - - changelog_log_data_t cld; - - /** - * ->prev_entry is used in cases when there needs to be - * additional changelog entry for the parent (eg. rename) - * It's analogous to ->next in single linked list world, - * but we call it as ->prev_entry... ha ha ha - */ - struct changelog_local *prev_entry; + inode_t *inode; + gf_boolean_t update_no_check; + + changelog_log_data_t cld; + + /** + * ->prev_entry is used in cases when there needs to be + * additional changelog entry for the parent (eg. rename) + * It's analogous to ->next in single linked list world, + * but we call it as ->prev_entry... ha ha ha + */ + struct changelog_local *prev_entry; + + /* snap dependency changes */ + chlog_fop_color_t color; }; typedef struct changelog_local changelog_local_t; /* inode version is stored in inode ctx */ typedef struct changelog_inode_ctx { - unsigned long iversion[CHANGELOG_MAX_TYPE]; + unsigned long iversion[CHANGELOG_MAX_TYPE]; } changelog_inode_ctx_t; -#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type]) +#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type]) /** * Optional Records: @@ -223,173 +358,359 @@ typedef struct changelog_inode_ctx { * @changelog_opt_t struct. The array is allocated via @iobufs. */ typedef enum { - CHANGELOG_OPT_REC_FOP, - CHANGELOG_OPT_REC_ENTRY, + CHANGELOG_OPT_REC_FOP, + CHANGELOG_OPT_REC_ENTRY, + CHANGELOG_OPT_REC_UINT32, } changelog_optional_rec_type_t; struct changelog_entry_fields { - uuid_t cef_uuid; - char *cef_bname; + uuid_t cef_uuid; + char *cef_bname; + char *cef_path; }; typedef struct { - /** - * @co_covert can be used to do post-processing of the record before - * it's persisted to the CHANGELOG. 
If this is NULL, then the record - * is persisted as per it's in memory format. - */ - size_t (*co_convert) (void *data, char *buffer, gf_boolean_t encode); - - /* release routines */ - void (*co_free) (void *data); - - /* type of the field */ - changelog_optional_rec_type_t co_type; - - /** - * sizeof of the 'valid' field in the union. This field is not used if - * @co_convert is specified. - */ - size_t co_len; - - union { - glusterfs_fop_t co_fop; - struct changelog_entry_fields co_entry; - }; + /** + * @co_covert can be used to do post-processing of the record before + * it's persisted to the CHANGELOG. If this is NULL, then the record + * is persisted as per it's in memory format. + */ + size_t (*co_convert)(void *data, char *buffer, gf_boolean_t encode); + + /* release routines */ + void (*co_free)(void *data); + + /* type of the field */ + changelog_optional_rec_type_t co_type; + + /** + * sizeof of the 'valid' field in the union. This field is not used if + * @co_convert is specified. 
+ */ + size_t co_len; + + union { + unsigned int co_uint32; + glusterfs_fop_t co_fop; + struct changelog_entry_fields co_entry; + }; } changelog_opt_t; -#define CHANGELOG_OPT_RECORD_LEN sizeof (changelog_opt_t) +#define CHANGELOG_OPT_RECORD_LEN sizeof(changelog_opt_t) /** * helpers routines */ +int +changelog_thread_cleanup(xlator_t *this, pthread_t thr_id); + +void * +changelog_get_usable_buffer(changelog_local_t *local); + void -changelog_thread_cleanup (xlator_t *this, pthread_t thr_id); -inline void * -changelog_get_usable_buffer (changelog_local_t *local); -inline void -changelog_set_usable_record_and_length (changelog_local_t *local, - size_t len, int xr); +changelog_set_usable_record_and_length(changelog_local_t *local, size_t len, + int xr); void -changelog_local_cleanup (xlator_t *xl, changelog_local_t *local); +changelog_local_cleanup(xlator_t *xl, changelog_local_t *local); changelog_local_t * -changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid, - int xtra_records, gf_boolean_t update_flag); +changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid, + int xtra_records, gf_boolean_t update_flag); +int +changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts, + gf_boolean_t finale); int -changelog_start_next_change (xlator_t *this, - changelog_priv_t *priv, - unsigned long ts, gf_boolean_t finale); +changelog_open_journal(xlator_t *this, changelog_priv_t *priv); +void +changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last); int -changelog_open (xlator_t *this, changelog_priv_t *priv); +changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv, + changelog_log_data_t *cld); +size_t +changelog_entry_length(); int -changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last); +changelog_write(int fd, char *buffer, size_t len); int -changelog_inject_single_event (xlator_t *this, - changelog_priv_t *priv, - changelog_log_data_t *cld); -inline size_t 
-changelog_entry_length (); -inline int -changelog_write (int fd, char *buffer, size_t len); +changelog_write_change(changelog_priv_t *priv, char *buffer, size_t len); int -changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len); -inline int -changelog_handle_change (xlator_t *this, - changelog_priv_t *priv, changelog_log_data_t *cld); -inline void -changelog_update (xlator_t *this, changelog_priv_t *priv, - changelog_local_t *local, changelog_log_type type); +changelog_handle_change(xlator_t *this, changelog_priv_t *priv, + changelog_log_data_t *cld); +void +changelog_update(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type); void * -changelog_rollover (void *data); +changelog_rollover(void *data); void * -changelog_fsync_thread (void *data); +changelog_fsync_thread(void *data); +int +changelog_forget(xlator_t *this, inode_t *inode); +int +htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer); +int +htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts); +int +htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts); + +/* Geo-Rep snapshot dependency changes */ +void +changelog_color_fop_and_inc_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local); +void +changelog_inc_fop_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local); +void +changelog_dec_fop_cnt(xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local); +int +changelog_barrier_notify(changelog_priv_t *priv, char *buf); +void +changelog_barrier_cleanup(xlator_t *this, changelog_priv_t *priv, + struct list_head *queue); +void +changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv); +void +changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv); + +/* Crash consistency of changelog wrt snapshot */ +int +changelog_snap_logging_stop(xlator_t *this, changelog_priv_t *priv); +int +changelog_snap_logging_start(xlator_t *this, changelog_priv_t 
*priv); +int +changelog_snap_open(xlator_t *this, changelog_priv_t *priv); +int +changelog_snap_handle_ascii_change(xlator_t *this, changelog_log_data_t *cld); int -changelog_forget (xlator_t *this, inode_t *inode); +changelog_snap_write_change(changelog_priv_t *priv, char *buffer, size_t len); + +/* Changelog barrier routines */ +void +__chlog_barrier_enqueue(xlator_t *this, call_stub_t *stub); +void +__chlog_barrier_disable(xlator_t *this, struct list_head *queue); +void +chlog_barrier_dequeue_all(xlator_t *this, struct list_head *queue); +call_stub_t * +__chlog_barrier_dequeue(xlator_t *this, struct list_head *queue); +int +__chlog_barrier_enable(xlator_t *this, changelog_priv_t *priv); + +int32_t +changelog_fill_entry_buf(call_frame_t *frame, xlator_t *this, loc_t *loc, + changelog_local_t **local); + +/* event selection routines */ +void +changelog_select_event(xlator_t *, changelog_ev_selector_t *, unsigned int); +void +changelog_deselect_event(xlator_t *, changelog_ev_selector_t *, unsigned int); +int +changelog_init_event_selection(xlator_t *, changelog_ev_selector_t *); +int +changelog_ev_selected(xlator_t *, changelog_ev_selector_t *, unsigned int); +void +changelog_dispatch_event(xlator_t *, changelog_priv_t *, changelog_event_t *); + +changelog_inode_ctx_t * +__changelog_inode_ctx_get(xlator_t *, inode_t *, unsigned long **, + unsigned long *, changelog_log_type); +int +resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path, + char *bname); /* macros */ -#define CHANGELOG_STACK_UNWIND(fop, frame, params ...) 
do { \ - changelog_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - if (frame) { \ - __local = frame->local; \ - __xl = frame->this; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - changelog_local_cleanup (__xl, __local); \ - if (__local && __local->prev_entry) \ - changelog_local_cleanup (__xl, \ - __local->prev_entry); \ - } while (0) - -#define CHANGELOG_IOBUF_REF(iobuf) do { \ - if (iobuf) \ - iobuf_ref (iobuf); \ - } while (0) - -#define CHANGELOG_IOBUF_UNREF(iobuf) do { \ - if (iobuf) \ - iobuf_unref (iobuf); \ - } while (0) - -#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) do { \ - memcpy (buffer + off, val, len); \ - off += len; \ - } while (0) - -#define SLICE_VERSION_UPDATE(slice) do { \ - int i = 0; \ - for (; i < CHANGELOG_MAX_TYPE; i++) { \ - slice->changelog_version[i]++; \ - } \ - } while (0) - -#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \ - co->co_convert = converter; \ - co->co_free = NULL; \ - co->co_type = CHANGELOG_OPT_REC_FOP; \ - co->co_fop = fop; \ - xlen += sizeof (fop); \ - } while (0) - -#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \ - converter, freefn, xlen, label) \ - do { \ - co->co_convert = converter; \ - co->co_free = freefn; \ - co->co_type = CHANGELOG_OPT_REC_ENTRY; \ - uuid_copy (co->co_entry.cef_uuid, pargfid); \ - co->co_entry.cef_bname = gf_strdup(bname); \ - if (!co->co_entry.cef_bname) \ - goto label; \ - xlen += (UUID_CANONICAL_FORM_LEN + strlen (bname)); \ - } while (0) - -#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \ - local = changelog_local_init (this, inode, gfid, xrec, _gf_false) - -#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \ - local = changelog_local_init (this, inode, gfid, xrec, _gf_true) - -#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) do { \ - if (!priv->active) \ - goto label; \ - /* ignore rebalance process's activity. 
*/ \ - if (frame->root->pid == GF_CLIENT_PID_DEFRAG) \ - goto label; \ - } while (0) - -/* ignore internal fops */ -#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(dict, label) do { \ - if (dict && dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)) \ - goto label; \ - } while (0) - -#define CHANGELOG_COND_GOTO(priv, cond, label) do { \ - if (!priv->active || cond) \ - goto label; \ - } while (0) +#define CHANGELOG_STACK_UNWIND(fop, frame, params...) \ + do { \ + changelog_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __local = frame->local; \ + __xl = frame->this; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local && __local->prev_entry) \ + changelog_local_cleanup(__xl, __local->prev_entry); \ + changelog_local_cleanup(__xl, __local); \ + } while (0) + +#define CHANGELOG_IOBUF_REF(iobuf) \ + do { \ + if (iobuf) \ + iobuf_ref(iobuf); \ + } while (0) + +#define CHANGELOG_IOBUF_UNREF(iobuf) \ + do { \ + if (iobuf) \ + iobuf_unref(iobuf); \ + } while (0) + +#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) \ + do { \ + memcpy(buffer + off, val, len); \ + off += len; \ + } while (0) + +#define SLICE_VERSION_UPDATE(slice) \ + do { \ + int i = 0; \ + for (; i < CHANGELOG_MAX_TYPE; i++) { \ + slice->changelog_version[i]++; \ + } \ + } while (0) + +#define CHANGELOG_FILL_UINT32(co, number, converter, xlen) \ + do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_UINT32; \ + co->co_uint32 = number; \ + xlen += sizeof(unsigned int); \ + } while (0) + +#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen) \ + do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_FOP; \ + co->co_fop = fop; \ + xlen += sizeof(fop); \ + } while (0) + +#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, converter, freefn, xlen, \ + label) \ + do { \ + co->co_convert = converter; \ + co->co_free = freefn; \ + co->co_type = CHANGELOG_OPT_REC_ENTRY; \ 
+ gf_uuid_copy(co->co_entry.cef_uuid, pargfid); \ + co->co_entry.cef_bname = gf_strdup(bname); \ + if (!co->co_entry.cef_bname) \ + goto label; \ + xlen += (UUID_CANONICAL_FORM_LEN + strlen(bname)); \ + } while (0) + +#define CHANGELOG_FILL_ENTRY_DIR_PATH(co, pargfid, bname, converter, \ + del_freefn, xlen, label, capture_del) \ + do { \ + co->co_convert = converter; \ + co->co_free = del_freefn; \ + co->co_type = CHANGELOG_OPT_REC_ENTRY; \ + gf_uuid_copy(co->co_entry.cef_uuid, pargfid); \ + co->co_entry.cef_bname = gf_strdup(bname); \ + if (!co->co_entry.cef_bname) \ + goto label; \ + xlen += (UUID_CANONICAL_FORM_LEN + strlen(bname)); \ + if (!capture_del || \ + resolve_pargfid_to_path(this, pargfid, &(co->co_entry.cef_path), \ + co->co_entry.cef_bname)) { \ + co->co_entry.cef_path = gf_strdup("\0"); \ + xlen += 1; \ + } else { \ + xlen += (strlen(co->co_entry.cef_path)); \ + } \ + } while (0) + +#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \ + local = changelog_local_init(this, inode, gfid, xrec, _gf_false) + +#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \ + local = changelog_local_init(this, inode, gfid, xrec, _gf_true) + +#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) \ + do { \ + if (!priv->active) \ + goto label; \ + /* ignore rebalance process's activity. */ \ + if ((frame->root->pid == GF_CLIENT_PID_DEFRAG) || \ + (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG)) \ + goto label; \ + } while (0) + +/* If it is a METADATA entry and fop num being GF_FOP_NULL, don't + * log in the changelog as it is of no use. And also if it is + * logged, since slicing version checking is done for metadata + * entries, the subsequent entries with valid fop num which falls + * to same changelog will be missed. Hence check for boundary + * condition. 
+ */ +#define CHANGELOG_OP_BOUNDARY_CHECK(frame, label) \ + do { \ + if (frame->root->op <= GF_FOP_NULL || \ + frame->root->op >= GF_FOP_MAXVALUE) \ + goto label; \ + } while (0) + +/** + * ignore internal fops for all clients except AFR self-heal daemon + */ +#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, dict, label) \ + do { \ + if ((frame->root->pid != GF_CLIENT_PID_SELF_HEALD) && dict && \ + dict_get(dict, GLUSTERFS_INTERNAL_FOP_KEY)) \ + goto label; \ + } while (0) + +#define CHANGELOG_COND_GOTO(priv, cond, label) \ + do { \ + if (!priv->active || cond) \ + goto label; \ + } while (0) + +/* Begin: Geo-Rep snapshot dependency changes */ + +#define DICT_ERROR -1 +#define BARRIER_OFF 0 +#define BARRIER_ON 1 +#define DICT_DEFAULT 2 + +#define CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, label) \ + do { \ + if (!priv->active) { \ + gf_smsg(this->name, GF_LOG_WARNING, 0, \ + CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE, NULL); \ + ret = 0; \ + goto label; \ + } \ + } while (0) + +/* Log pthread error and goto label */ +#define CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, label) \ + do { \ + if (ret) { \ + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \ + "error=%d", ret, NULL); \ + ret = -1; \ + goto label; \ + } \ + } while (0); + +/* Log pthread error, set flag and goto label */ +#define CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, label, flag) \ + do { \ + if (ret) { \ + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \ + "error=%d", ret, NULL); \ + ret = -1; \ + flag = _gf_true; \ + goto label; \ + } \ + } while (0) + +/* Log pthread error, unlock mutex and goto label */ +#define CHANGELOG_PTHREAD_ERROR_HANDLE_2(ret, label, mutex) \ + do { \ + if (ret) { \ + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \ + "error=%d", ret, NULL); \ + ret = -1; \ + pthread_mutex_unlock(&mutex); \ + goto label; \ + } \ + } while (0) + +/* End: Geo-Rep snapshot dependency changes */ #endif /* _CHANGELOG_HELPERS_H */ diff --git 
a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h index d72464eab70..a2d8a9cbe93 100644 --- a/xlators/features/changelog/src/changelog-mem-types.h +++ b/xlators/features/changelog/src/changelog-mem-types.h @@ -11,19 +11,24 @@ #ifndef _CHANGELOG_MEM_TYPES_H #define _CHANGELOG_MEM_TYPES_H -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_changelog_mem_types { - gf_changelog_mt_priv_t = gf_common_mt_end + 1, - gf_changelog_mt_str_t = gf_common_mt_end + 2, - gf_changelog_mt_batch_t = gf_common_mt_end + 3, - gf_changelog_mt_rt_t = gf_common_mt_end + 4, - gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, - gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 6, - gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 7, - gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 8, - gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 9, - gf_changelog_mt_end + gf_changelog_mt_priv_t = gf_common_mt_end + 1, + gf_changelog_mt_str_t = gf_common_mt_end + 2, + gf_changelog_mt_batch_t = gf_common_mt_end + 3, + gf_changelog_mt_rt_t = gf_common_mt_end + 4, + gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, + gf_changelog_mt_rpc_clnt_t = gf_common_mt_end + 6, + gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7, + gf_changelog_mt_libgfchangelog_entry_t = gf_common_mt_end + 8, + gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 9, + gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 10, + gf_changelog_mt_history_data_t = gf_common_mt_end + 11, + gf_changelog_mt_libgfchangelog_call_pool_t = gf_common_mt_end + 12, + gf_changelog_mt_libgfchangelog_event_t = gf_common_mt_end + 13, + gf_changelog_mt_ev_dispatcher_t = gf_common_mt_end + 14, + gf_changelog_mt_end }; #endif diff --git a/xlators/features/changelog/src/changelog-messages.h b/xlators/features/changelog/src/changelog-messages.h new file mode 100644 index 00000000000..cb0e16c85d8 --- /dev/null +++ 
b/xlators/features/changelog/src/changelog-messages.h @@ -0,0 +1,172 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _CHANGELOG_MESSAGES_H_ +#define _CHANGELOG_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID( + CHANGELOG, CHANGELOG_MSG_OPEN_FAILED, CHANGELOG_MSG_BARRIER_FOP_FAILED, + CHANGELOG_MSG_VOL_MISCONFIGURED, CHANGELOG_MSG_RENAME_ERROR, + CHANGELOG_MSG_READ_ERROR, CHANGELOG_MSG_HTIME_ERROR, + CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, + CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, CHANGELOG_MSG_CHILD_MISCONFIGURED, + CHANGELOG_MSG_DIR_OPTIONS_NOT_SET, CHANGELOG_MSG_CLOSE_ERROR, + CHANGELOG_MSG_PIPE_CREATION_ERROR, CHANGELOG_MSG_DICT_GET_FAILED, + CHANGELOG_MSG_BARRIER_INFO, CHANGELOG_MSG_BARRIER_ERROR, + CHANGELOG_MSG_GET_TIME_OP_FAILED, CHANGELOG_MSG_WRITE_FAILED, + CHANGELOG_MSG_PTHREAD_ERROR, CHANGELOG_MSG_INODE_NOT_FOUND, + CHANGELOG_MSG_FSYNC_OP_FAILED, CHANGELOG_MSG_TOTAL_LOG_INFO, + CHANGELOG_MSG_SNAP_INFO, CHANGELOG_MSG_SELECT_FAILED, + CHANGELOG_MSG_FCNTL_FAILED, CHANGELOG_MSG_BNOTIFY_INFO, + CHANGELOG_MSG_ENTRY_BUF_INFO, CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE, + CHANGELOG_MSG_LOCAL_INIT_FAILED, CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, + CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, CHANGELOG_MSG_HANDLE_PROBE_ERROR, + CHANGELOG_MSG_SET_FD_CONTEXT, 
CHANGELOG_MSG_FREEUP_FAILED, + CHANGELOG_MSG_RECONFIGURE, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED, + CHANGELOG_MSG_RPC_BUILD_ERROR, CHANGELOG_MSG_RPC_CONNECT_ERROR, + CHANGELOG_MSG_RPC_START_ERROR, CHANGELOG_MSG_BUFFER_STARVATION_ERROR, + CHANGELOG_MSG_SCAN_DIR_FAILED, CHANGELOG_MSG_FSETXATTR_FAILED, + CHANGELOG_MSG_FGETXATTR_FAILED, CHANGELOG_MSG_CLEANUP_ON_ACTIVE_REF, + CHANGELOG_MSG_DISPATCH_EVENT_FAILED, CHANGELOG_MSG_PUT_BUFFER_FAILED, + CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, + CHANGELOG_MSG_INJECT_FSYNC_FAILED, CHANGELOG_MSG_CREATE_FRAME_FAILED, + CHANGELOG_MSG_FSTAT_OP_FAILED, CHANGELOG_MSG_LSEEK_OP_FAILED, + CHANGELOG_MSG_STRSTR_OP_FAILED, CHANGELOG_MSG_UNLINK_OP_FAILED, + CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED, + CHANGELOG_MSG_READLINK_OP_FAILED, CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED, + CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED, CHANGELOG_MSG_MEMORY_INIT_FAILED, + CHANGELOG_MSG_NO_MEMORY, CHANGELOG_MSG_HTIME_STAT_ERROR, + CHANGELOG_MSG_HTIME_CURRENT_ERROR, CHANGELOG_MSG_BNOTIFY_COND_INFO, + CHANGELOG_MSG_NO_HTIME_CURRENT, CHANGELOG_MSG_HTIME_CURRENT, + CHANGELOG_MSG_NEW_HTIME_FILE, CHANGELOG_MSG_MKDIR_ERROR, + CHANGELOG_MSG_PATH_NOT_FOUND, CHANGELOG_MSG_XATTR_INIT_FAILED, + CHANGELOG_MSG_WROTE_TO_CSNAP, CHANGELOG_MSG_UNUSED_0, + CHANGELOG_MSG_GET_BUFFER_FAILED, CHANGELOG_MSG_BARRIER_STATE_NOTIFY, + CHANGELOG_MSG_BARRIER_DISABLED, CHANGELOG_MSG_BARRIER_ALREADY_DISABLED, + CHANGELOG_MSG_BARRIER_ON_ERROR, CHANGELOG_MSG_BARRIER_ENABLE, + CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, CHANGELOG_MSG_ERROR_IN_DICT_GET, + CHANGELOG_MSG_UNUSED_1, CHANGELOG_MSG_UNUSED_2, + CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS, + CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED, + CHANGELOG_MSG_BARRIER_TIMEOUT, CHANGELOG_MSG_TIMEOUT_ADD_FAILED, + CHANGELOG_MSG_CLEANUP_ALREADY_SET); + +#define CHANGELOG_MSG_BARRIER_FOP_FAILED_STR \ + "failed to barrier FOPs, disabling changelog barrier" +#define CHANGELOG_MSG_MEMORY_INIT_FAILED_STR "memory accounting init failed" 
+#define CHANGELOG_MSG_NO_MEMORY_STR "failed to create local memory pool" +#define CHANGELOG_MSG_ENTRY_BUF_INFO_STR \ + "Entry cannot be captured for gfid, Capturing DATA entry." +#define CHANGELOG_MSG_PTHREAD_ERROR_STR "pthread error" +#define CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED_STR "pthread_mutex_init failed" +#define CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED_STR "pthread_cond_init failed" +#define CHANGELOG_MSG_HTIME_ERROR_STR "failed to update HTIME file" +#define CHANGELOG_MSG_HTIME_STAT_ERROR_STR "unable to stat htime file" +#define CHANGELOG_MSG_HTIME_CURRENT_ERROR_STR "Error extracting HTIME_CURRENT." +#define CHANGELOG_MSG_UNLINK_OP_FAILED_STR "error unlinking empty changelog" +#define CHANGELOG_MSG_RENAME_ERROR_STR "error renaming" +#define CHANGELOG_MSG_MKDIR_ERROR_STR "unable to create directory" +#define CHANGELOG_MSG_BNOTIFY_INFO_STR \ + "Explicit rollover changelog signaling bnotify" +#define CHANGELOG_MSG_BNOTIFY_COND_INFO_STR "Woke up: bnotify conditional wait" +#define CHANGELOG_MSG_RECONFIGURE_STR "Reconfigure: Changelog Enable" +#define CHANGELOG_MSG_NO_HTIME_CURRENT_STR \ + "HTIME_CURRENT not found. 
Changelog enabled before init" +#define CHANGELOG_MSG_HTIME_CURRENT_STR "HTIME_CURRENT" +#define CHANGELOG_MSG_NEW_HTIME_FILE_STR \ + "Changelog enable: Creating new HTIME file" +#define CHANGELOG_MSG_FGETXATTR_FAILED_STR "fgetxattr failed" +#define CHANGELOG_MSG_TOTAL_LOG_INFO_STR "changelog info" +#define CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED_STR "pthread cond wait failed" +#define CHANGELOG_MSG_INODE_NOT_FOUND_STR "inode not found" +#define CHANGELOG_MSG_READLINK_OP_FAILED_STR \ + "could not read the link from the gfid handle" +#define CHANGELOG_MSG_OPEN_FAILED_STR "unable to open file" +#define CHANGELOG_MSG_RPC_CONNECT_ERROR_STR "failed to connect back" +#define CHANGELOG_MSG_BUFFER_STARVATION_ERROR_STR \ + "Failed to get buffer for RPC dispatch" +#define CHANGELOG_MSG_PTHREAD_CANCEL_FAILED_STR "could not cancel thread" +#define CHANGELOG_MSG_FSTAT_OP_FAILED_STR "Could not stat (CHANGELOG)" +#define CHANGELOG_MSG_LSEEK_OP_FAILED_STR "Could not lseek (changelog)" +#define CHANGELOG_MSG_PATH_NOT_FOUND_STR \ + "Could not find CHANGELOG in changelog path" +#define CHANGELOG_MSG_FSYNC_OP_FAILED_STR "fsync failed" +#define CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED_STR \ + "Error detecting empty changelog" +#define CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED_STR \ + "Fail snapshot because of previous errors" +#define CHANGELOG_MSG_SCAN_DIR_FAILED_STR "scandir failed" +#define CHANGELOG_MSG_FSETXATTR_FAILED_STR "fsetxattr failed" +#define CHANGELOG_MSG_XATTR_INIT_FAILED_STR "Htime xattr initialization failed" +#define CHANGELOG_MSG_SNAP_INFO_STR "log in call path" +#define CHANGELOG_MSG_WRITE_FAILED_STR "error writing to disk" +#define CHANGELOG_MSG_WROTE_TO_CSNAP_STR "Successfully wrote to csnap" +#define CHANGELOG_MSG_GET_TIME_OP_FAILED_STR "Problem rolling over changelog(s)" +#define CHANGELOG_MSG_BARRIER_INFO_STR "Explicit wakeup on barrier notify" +#define CHANGELOG_MSG_SELECT_FAILED_STR "pthread_cond_timedwait failed" +#define CHANGELOG_MSG_INJECT_FSYNC_FAILED_STR 
"failed to inject fsync event" +#define CHANGELOG_MSG_LOCAL_INIT_FAILED_STR \ + "changelog local initialization failed" +#define CHANGELOG_MSG_GET_BUFFER_FAILED_STR "Failed to get buffer" +#define CHANGELOG_MSG_SET_FD_CONTEXT_STR \ + "could not set fd context(for release cbk)" +#define CHANGELOG_MSG_DICT_GET_FAILED_STR "Barrier failed" +#define CHANGELOG_MSG_BARRIER_STATE_NOTIFY_STR "Barrier notification" +#define CHANGELOG_MSG_BARRIER_ERROR_STR \ + "Received another barrier off notification while already off" +#define CHANGELOG_MSG_BARRIER_DISABLED_STR "disabled changelog barrier" +#define CHANGELOG_MSG_BARRIER_ALREADY_DISABLED_STR \ + "Changelog barrier already disabled" +#define CHANGELOG_MSG_BARRIER_ON_ERROR_STR \ + "Received another barrier on notification when last one is not served yet" +#define CHANGELOG_MSG_BARRIER_ENABLE_STR "Enabled changelog barrier" +#define CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND_STR "barrier key not found" +#define CHANGELOG_MSG_ERROR_IN_DICT_GET_STR \ + "Something went wrong in dict_get_str_boolean" +#define CHANGELOG_MSG_DIR_OPTIONS_NOT_SET_STR "changelog-dir option is not set" +#define CHANGELOG_MSG_FREEUP_FAILED_STR "could not cleanup bootstrapper" +#define CHANGELOG_MSG_CHILD_MISCONFIGURED_STR \ + "translator needs a single subvolume" +#define CHANGELOG_MSG_VOL_MISCONFIGURED_STR \ + "dangling volume. 
please check volfile" +#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_STR \ + "Dequeuing all the changelog barriered fops" +#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED_STR \ + "Dequeuing changelog barriered fops is finished" +#define CHANGELOG_MSG_BARRIER_TIMEOUT_STR \ + "Disabling changelog barrier because of the timeout" +#define CHANGELOG_MSG_TIMEOUT_ADD_FAILED_STR \ + "Couldn't add changelog barrier timeout event" +#define CHANGELOG_MSG_RPC_BUILD_ERROR_STR "failed to build rpc options" +#define CHANGELOG_MSG_NOTIFY_REGISTER_FAILED_STR "failed to register notify" +#define CHANGELOG_MSG_RPC_START_ERROR_STR "failed to start rpc" +#define CHANGELOG_MSG_CREATE_FRAME_FAILED_STR "failed to create frame" +#define CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED_STR "failed to serialize reply" +#define CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED_STR "cannot register program" +#define CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE_STR \ + "Changelog is not active, return success" +#define CHANGELOG_MSG_PUT_BUFFER_FAILED_STR \ + "failed to put buffer after consumption" +#define CHANGELOG_MSG_CLEANUP_ALREADY_SET_STR \ + "cleanup_starting flag is already set for xl" +#define CHANGELOG_MSG_HANDLE_PROBE_ERROR_STR "xdr decoding error" +#endif /* !_CHANGELOG_MESSAGES_H_ */ diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h index 0712a377183..e2addc09414 100644 --- a/xlators/features/changelog/src/changelog-misc.h +++ b/xlators/features/changelog/src/changelog-misc.h @@ -11,91 +11,121 @@ #ifndef _CHANGELOG_MISC_H #define _CHANGELOG_MISC_H -#include "glusterfs.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/common-utils.h> -#define CHANGELOG_MAX_TYPE 3 +#define CHANGELOG_MAX_TYPE 4 #define CHANGELOG_FILE_NAME "CHANGELOG" +#define HTIME_FILE_NAME "HTIME" +#define CSNAP_FILE_NAME "CHANGELOG.SNAP" +#define HTIME_KEY "trusted.glusterfs.htime" +#define HTIME_CURRENT "trusted.glusterfs.current_htime" +#define 
HTIME_INITIAL_VALUE "0:0" -#define CHANGELOG_VERSION_MAJOR 1 -#define CHANGELOG_VERSION_MINOR 0 +#define CHANGELOG_VERSION_MAJOR 1 +#define CHANGELOG_VERSION_MINOR 2 -#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock" +#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY "/changelog-%s.sock" +#define CHANGELOG_TMP_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY "/.%s%lu.sock" /** * header starts with the version and the format of the changelog. * 'version' not much of a use now. */ -#define CHANGELOG_HEADER \ - "GlusterFS Changelog | version: v%d.%d | encoding : %d\n" - -#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) do { \ - char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; \ - md5_wrapper((unsigned char *) brick_path, \ - strlen(brick_path), \ - md5_sum); \ - (void) snprintf (sockpath, len, \ - CHANGELOG_UNIX_SOCK, md5_sum); \ - } while (0) +#define CHANGELOG_HEADER \ + "GlusterFS Changelog | version: v%d.%d | encoding : %d\n" + +#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) \ + do { \ + char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = { \ + 0, \ + }; \ + gf_xxh64_wrapper((unsigned char *)brick_path, strlen(brick_path), \ + GF_XXHSUM64_DEFAULT_SEED, xxh64); \ + (void)snprintf(sockpath, len, CHANGELOG_UNIX_SOCK, xxh64); \ + } while (0) + +#define CHANGELOG_MAKE_TMP_SOCKET_PATH(brick_path, sockpath, len) \ + do { \ + unsigned long pid = 0; \ + char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = { \ + 0, \ + }; \ + pid = (unsigned long)getpid(); \ + gf_xxh64_wrapper((unsigned char *)brick_path, strlen(brick_path), \ + GF_XXHSUM64_DEFAULT_SEED, xxh64); \ + (void)snprintf(sockpath, len, CHANGELOG_TMP_UNIX_SOCK, xxh64, pid); \ + } while (0) /** * ... used by libgfchangelog. 
*/ -#define CHANGELOG_GET_ENCODING(fd, buffer, len, enc, enc_len) do { \ - FILE *fp; \ - int fd_dup, maj, min; \ - \ - enc = -1; \ - fd_dup = dup (fd); \ - \ - if (fd_dup != -1) { \ - fp = fdopen (fd_dup, "r"); \ - if (fp) { \ - if (fgets (buffer, len, fp)) { \ - elen = strlen (buffer); \ - sscanf (buffer, \ - CHANGELOG_HEADER, \ - &maj, &min, &enc); \ - } \ - fclose (fp); \ - } else { \ - close (fd_dup); \ - } \ - } \ - } while (0) - +#define CHANGELOG_GET_HEADER_INFO(fd, buffer, len, enc, maj, min, elen) \ + do { \ + FILE *fp; \ + int fd_dup; \ + \ + enc = -1; \ + maj = -1; \ + min = -1; \ + fd_dup = dup(fd); \ + \ + if (fd_dup != -1) { \ + fp = fdopen(fd_dup, "r"); \ + if (fp) { \ + if (fgets(buffer, len, fp)) { \ + elen = strlen(buffer); \ + sscanf(buffer, CHANGELOG_HEADER, &maj, &min, &enc); \ + } \ + fclose(fp); \ + } else { \ + sys_close(fd_dup); \ + } \ + } \ + } while (0) + +#define CHANGELOG_FILL_HTIME_DIR(changelog_dir, path) \ + do { \ + snprintf(path, sizeof(path), "%s/htime", changelog_dir); \ + } while (0) + +#define CHANGELOG_FILL_CSNAP_DIR(changelog_dir, path) \ + do { \ + snprintf(path, sizeof(path), "%s/csnap", changelog_dir); \ + } while (0) /** - * everything after 'CHANGELOG_TYPE_ENTRY' are internal types + * everything after 'CHANGELOG_TYPE_METADATA_XATTR' are internal types * (ie. 
none of the fops trigger this type of event), hence - * CHANGELOG_MAX_TYPE = 3 + * CHANGELOG_MAX_TYPE = 4 */ typedef enum { - CHANGELOG_TYPE_DATA = 0, - CHANGELOG_TYPE_METADATA, - CHANGELOG_TYPE_ENTRY, - CHANGELOG_TYPE_ROLLOVER, - CHANGELOG_TYPE_FSYNC, + CHANGELOG_TYPE_DATA = 0, + CHANGELOG_TYPE_METADATA, + CHANGELOG_TYPE_ENTRY, + CHANGELOG_TYPE_METADATA_XATTR, + CHANGELOG_TYPE_ROLLOVER, + CHANGELOG_TYPE_FSYNC, } changelog_log_type; /* operation modes - RT for now */ typedef enum { - CHANGELOG_MODE_RT = 0, + CHANGELOG_MODE_RT = 0, } changelog_mode_t; /* encoder types */ typedef enum { - CHANGELOG_ENCODE_MIN = 0, - CHANGELOG_ENCODE_BINARY, - CHANGELOG_ENCODE_ASCII, - CHANGELOG_ENCODE_MAX, + CHANGELOG_ENCODE_MIN = 0, + CHANGELOG_ENCODE_BINARY, + CHANGELOG_ENCODE_ASCII, + CHANGELOG_ENCODE_MAX, } changelog_encoder_t; -#define CHANGELOG_VALID_ENCODING(enc) \ - (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX) +#define CHANGELOG_VALID_ENCODING(enc) \ + (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX) -#define CHANGELOG_TYPE_IS_ENTRY(type) (type == CHANGELOG_TYPE_ENTRY) -#define CHANGELOG_TYPE_IS_ROLLOVER(type) (type == CHANGELOG_TYPE_ROLLOVER) -#define CHANGELOG_TYPE_IS_FSYNC(type) (type == CHANGELOG_TYPE_FSYNC) +#define CHANGELOG_TYPE_IS_ENTRY(type) (type == CHANGELOG_TYPE_ENTRY) +#define CHANGELOG_TYPE_IS_ROLLOVER(type) (type == CHANGELOG_TYPE_ROLLOVER) +#define CHANGELOG_TYPE_IS_FSYNC(type) (type == CHANGELOG_TYPE_FSYNC) #endif /* _CHANGELOG_MISC_H */ diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c deleted file mode 100644 index 1f8b312538e..00000000000 --- a/xlators/features/changelog/src/changelog-notifier.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#include "changelog-notifier.h" - -#include <pthread.h> - -inline static void -changelog_notify_clear_fd (changelog_notify_t *cn, int i) -{ - cn->client_fd[i] = -1; -} - -inline static void -changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd) -{ - cn->client_fd[i] = fd; -} - -static int -changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd) -{ - int i = 0; - int ret = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - break; - } - - if (i == CHANGELOG_MAX_CLIENTS) { - /** - * this case should not be hit as listen() would limit - * the number of completely established connections. - */ - gf_log (this->name, GF_LOG_WARNING, - "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS); - ret = -1; - } - else - changelog_notify_save_fd (cn, i, fd); - - return ret; -} - -static void -changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd) -{ - int i = 0; - - FD_ZERO (rset); - - FD_SET (cn->socket_fd, rset); - *maxfd = cn->socket_fd; - - FD_SET (cn->rfd, rset); - *maxfd = max (*maxfd, cn->rfd); - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] != -1) { - FD_SET (cn->client_fd[i], rset); - *maxfd = max (*maxfd, cn->client_fd[i]); - } - } - - *maxfd = *maxfd + 1; -} - -static int -changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len) -{ - int i = 0; - int ret = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - continue; - - if (changelog_write (cn->client_fd[i], - path, len)) { - ret = -1; - - close (cn->client_fd[i]); - changelog_notify_clear_fd (cn, i); - } - } - - return ret; -} - -static void -changelog_notifier_init (changelog_notify_t *cn) -{ - int i = 0; - - 
cn->socket_fd = -1; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - changelog_notify_clear_fd (cn, i); - } -} - -static void -changelog_close_client_conn (changelog_notify_t *cn) -{ - int i = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - continue; - - close (cn->client_fd[i]); - changelog_notify_clear_fd (cn, i); - } -} - -static void -changelog_notifier_cleanup (void *arg) -{ - changelog_notify_t *cn = NULL; - - cn = (changelog_notify_t *) arg; - - changelog_close_client_conn (cn); - - if (cn->socket_fd != -1) - close (cn->socket_fd); - - if (cn->rfd) - close (cn->rfd); - - if (unlink (cn->sockpath)) - gf_log ("", GF_LOG_WARNING, - "could not unlink changelog socket file" - " %s (reason: %s", cn->sockpath, strerror (errno)); -} - -void * -changelog_notifier (void *data) -{ - int i = 0; - int fd = 0; - int max_fd = 0; - int len = 0; - ssize_t readlen = 0; - xlator_t *this = NULL; - changelog_priv_t *priv = NULL; - changelog_notify_t *cn = NULL; - struct sockaddr_un local = {0,}; - char path[PATH_MAX] = {0,}; - char abspath[PATH_MAX] = {0,}; - - char buffer; - fd_set rset; - - priv = (changelog_priv_t *) data; - - cn = &priv->cn; - this = cn->this; - - pthread_cleanup_push (changelog_notifier_cleanup, cn); - - changelog_notifier_init (cn); - - cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0); - if (cn->socket_fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "changelog socket error (reason: %s)", - strerror (errno)); - goto out; - } - - CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, - cn->sockpath, PATH_MAX); - if (unlink (cn->sockpath) < 0) { - if (errno != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "Could not unlink changelog socket file (%s)" - " (reason: %s)", - CHANGELOG_UNIX_SOCK, strerror (errno)); - goto cleanup; - } - } - - local.sun_family = AF_UNIX; - strcpy (local.sun_path, cn->sockpath); - - len = strlen (local.sun_path) + sizeof (local.sun_family); - - /* bind to the unix domain socket */ - if (bind 
(cn->socket_fd, (struct sockaddr *) &local, len) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not bind to changelog socket (reason: %s)", - strerror (errno)); - goto cleanup; - } - - /* listen for incoming connections */ - if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "listen() error on changelog socket (reason: %s)", - strerror (errno)); - goto cleanup; - } - - /** - * simple select() on all to-be-read file descriptors. This method - * though old school works pretty well when you have a handfull of - * fd's to be watched (clients). - * - * Future TODO: move this to epoll based notification facility if - * number of clients increase. - */ - for (;;) { - changelog_notify_fill_rset (cn, &rset, &max_fd); - - if (select (max_fd, &rset, NULL, NULL, NULL) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "select() returned -1 (reason: %s)", - strerror (errno)); - sleep (2); - continue; - } - - if (FD_ISSET (cn->socket_fd, &rset)) { - fd = accept (cn->socket_fd, NULL, NULL); - if (fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "accept error on changelog socket" - " (reason: %s)", strerror (errno)); - } else if (changelog_notify_insert_fd (this, cn, fd)) { - gf_log (this->name, GF_LOG_ERROR, - "hit max client limit"); - } - } - - if (FD_ISSET (cn->rfd, &rset)) { - /** - * read changelog filename and notify all connected - * clients. - */ - readlen = 0; - while (readlen < PATH_MAX) { - len = read (cn->rfd, &path[readlen++], 1); - if (len == -1) { - break; - } - - if (len == 0) { - gf_log (this->name, GF_LOG_ERROR, - "rollover thread sent EOF" - " on pipe - possibly a crash."); - /* be blunt and close all connections */ - pthread_exit(NULL); - } - - if (path[readlen - 1] == '\0') - break; - } - - /* should we close all client connections here too? 
*/ - if (len < 0 || readlen == PATH_MAX) { - gf_log (this->name, GF_LOG_ERROR, - "Could not get pathname from rollover" - " thread or pathname too long"); - goto process_rest; - } - - (void) snprintf (abspath, PATH_MAX, - "%s/%s", priv->changelog_dir, path); - if (changelog_notify_client (cn, abspath, - strlen (abspath) + 1)) - gf_log (this->name, GF_LOG_ERROR, - "could not notify some clients with new" - " changelogs"); - } - - process_rest: - for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) { - if ( (fd = cn->client_fd[i]) == -1 ) - continue; - - if (FD_ISSET (fd, &rset)) { - /** - * the only data we accept from the client is a - * disconnect. Anything else is treated as bogus - * and is silently discarded (also warned!!!). - */ - if ( (readlen = read (fd, &buffer, 1)) <= 0 ) { - close (fd); - changelog_notify_clear_fd (cn, i); - } else { - /* silently discard data and log */ - gf_log (this->name, GF_LOG_WARNING, - "misbehaving changelog client"); - } - } - } - - } - - cleanup:; - pthread_cleanup_pop (1); - - out: - return NULL; -} diff --git a/xlators/features/changelog/src/changelog-rpc-common.c b/xlators/features/changelog/src/changelog-rpc-common.c new file mode 100644 index 00000000000..125246a17e1 --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc-common.c @@ -0,0 +1,359 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "changelog-rpc-common.h" +#include "changelog-messages.h" + +#include <glusterfs/syscall.h> +/** +***************************************************** + Client Interface +***************************************************** +*/ + +/** + * Initialize and return an RPC client object for a given unix + * domain socket. + */ + +void * +changelog_rpc_poller(void *arg) +{ + xlator_t *this = arg; + + (void)gf_event_dispatch(this->ctx->event_pool); + return NULL; +} + +struct rpc_clnt * +changelog_rpc_client_init(xlator_t *this, void *cbkdata, char *sockfile, + rpc_clnt_notify_t fn) +{ + int ret = 0; + struct rpc_clnt *rpc = NULL; + dict_t *options = NULL; + + if (!cbkdata) + cbkdata = this; + + options = dict_new(); + if (!options) + goto error_return; + + ret = rpc_transport_unix_options_build(options, sockfile, 0); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_BUILD_ERROR, + NULL); + goto dealloc_dict; + } + + rpc = rpc_clnt_new(options, this, this->name, 16); + if (!rpc) + goto dealloc_dict; + + ret = rpc_clnt_register_notify(rpc, fn, cbkdata); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL); + goto dealloc_rpc_clnt; + } + + ret = rpc_clnt_start(rpc); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR, + NULL); + goto dealloc_rpc_clnt; + } + + dict_unref(options); + return rpc; + +dealloc_rpc_clnt: + rpc_clnt_unref(rpc); +dealloc_dict: + dict_unref(options); +error_return: + return NULL; +} + +/** + * Generic RPC client routine to dispatch a request to an + * RPC server. 
+ */ +int +changelog_rpc_sumbit_req(struct rpc_clnt *rpc, void *req, call_frame_t *frame, + rpc_clnt_prog_t *prog, int procnum, + struct iovec *payload, int payloadcnt, + struct iobref *iobref, xlator_t *this, + fop_cbk_fn_t cbkfn, xdrproc_t xdrproc) +{ + int ret = 0; + int count = 0; + struct iovec iov = { + 0, + }; + struct iobuf *iobuf = NULL; + char new_iobref = 0; + ssize_t xdr_size = 0; + + GF_ASSERT(this); + + if (req) { + xdr_size = xdr_sizeof(xdrproc, req); + + iobuf = iobuf_get2(this->ctx->iobuf_pool, xdr_size); + if (!iobuf) { + goto out; + }; + + if (!iobref) { + iobref = iobref_new(); + if (!iobref) { + goto out; + } + + new_iobref = 1; + } + + iobref_add(iobref, iobuf); + + iov.iov_base = iobuf->ptr; + iov.iov_len = iobuf_size(iobuf); + + /* Create the xdr payload */ + ret = xdr_serialize_generic(iov, req, xdrproc); + if (ret == -1) { + goto out; + } + + iov.iov_len = ret; + count = 1; + } + + ret = rpc_clnt_submit(rpc, prog, procnum, cbkfn, &iov, count, payload, + payloadcnt, iobref, frame, NULL, 0, NULL, 0, NULL); + +out: + if (new_iobref) + iobref_unref(iobref); + if (iobuf) + iobuf_unref(iobuf); + return ret; +} + +/** + * Entry point to perform a remote procedure call + */ +int +changelog_invoke_rpc(xlator_t *this, struct rpc_clnt *rpc, + rpc_clnt_prog_t *prog, int procidx, void *arg) +{ + int ret = 0; + call_frame_t *frame = NULL; + rpc_clnt_procedure_t *proc = NULL; + + if (!this || !prog) + goto error_return; + + frame = create_frame(this, this->ctx->pool); + if (!frame) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CREATE_FRAME_FAILED, + NULL); + goto error_return; + } + + proc = &prog->proctable[procidx]; + if (proc->fn) + ret = proc->fn(frame, this, arg); + + STACK_DESTROY(frame->root); + return ret; + +error_return: + return -1; +} + +/** +***************************************************** + Server Interface +***************************************************** +*/ + +struct iobuf * 
+__changelog_rpc_serialize_reply(rpcsvc_request_t *req, void *arg, + struct iovec *outmsg, xdrproc_t xdrproc) +{ + struct iobuf *iob = NULL; + ssize_t retlen = 0; + ssize_t rsp_size = 0; + + rsp_size = xdr_sizeof(xdrproc, arg); + iob = iobuf_get2(req->svc->ctx->iobuf_pool, rsp_size); + if (!iob) + goto error_return; + + iobuf_to_iovec(iob, outmsg); + + retlen = xdr_serialize_generic(*outmsg, arg, xdrproc); + if (retlen == -1) + goto unref_iob; + + outmsg->iov_len = retlen; + return iob; + +unref_iob: + iobuf_unref(iob); +error_return: + return NULL; +} + +int +changelog_rpc_sumbit_reply(rpcsvc_request_t *req, void *arg, + struct iovec *payload, int payloadcount, + struct iobref *iobref, xdrproc_t xdrproc) +{ + int ret = -1; + struct iobuf *iob = NULL; + struct iovec iov = { + 0, + }; + char new_iobref = 0; + + if (!req) + goto return_ret; + + if (!iobref) { + iobref = iobref_new(); + if (!iobref) + goto return_ret; + new_iobref = 1; + } + + iob = __changelog_rpc_serialize_reply(req, arg, &iov, xdrproc); + if (!iob) + gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED, + NULL); + else + iobref_add(iobref, iob); + + ret = rpcsvc_submit_generic(req, &iov, 1, payload, payloadcount, iobref); + + if (new_iobref) + iobref_unref(iobref); + if (iob) + iobuf_unref(iob); +return_ret: + return ret; +} + +void +changelog_rpc_server_destroy(xlator_t *this, rpcsvc_t *rpc, char *sockfile, + rpcsvc_notify_t fn, struct rpcsvc_program **progs) +{ + rpcsvc_listener_t *listener = NULL; + rpcsvc_listener_t *next = NULL; + struct rpcsvc_program *prog = NULL; + rpc_transport_t *trans = NULL; + + if (!rpc) + return; + + while (*progs) { + prog = *progs; + (void)rpcsvc_program_unregister(rpc, prog); + progs++; + } + + list_for_each_entry_safe(listener, next, &rpc->listeners, list) + { + if (listener->trans) { + trans = listener->trans; + rpc_transport_disconnect(trans, _gf_false); + } + } + + (void)rpcsvc_unregister_notify(rpc, fn, this); + + /* TODO Avoid freeing rpc object 
in case of brick multiplex + after freeing rpc object svc->rpclock corrupted and it takes + more time to detach a brick + */ + if (!this->cleanup_starting) { + if (rpc->rxpool) { + mem_pool_destroy(rpc->rxpool); + rpc->rxpool = NULL; + } + GF_FREE(rpc); + } +} + +rpcsvc_t * +changelog_rpc_server_init(xlator_t *this, char *sockfile, void *cbkdata, + rpcsvc_notify_t fn, struct rpcsvc_program **progs) +{ + int ret = 0; + rpcsvc_t *rpc = NULL; + dict_t *options = NULL; + struct rpcsvc_program *prog = NULL; + + if (!cbkdata) + cbkdata = this; + + options = dict_new(); + if (!options) + return NULL; + + ret = rpcsvc_transport_unix_options_build(options, sockfile); + if (ret) + goto dealloc_dict; + + rpc = rpcsvc_init(this, this->ctx, options, 8); + if (rpc == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR, + NULL); + goto dealloc_dict; + } + + ret = rpcsvc_register_notify(rpc, fn, cbkdata); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL); + goto dealloc_rpc; + } + + ret = rpcsvc_create_listeners(rpc, options, this->name); + if (ret != 1) { + gf_msg_debug(this->name, 0, "failed to create listeners"); + goto dealloc_rpc; + } + + while (*progs) { + prog = *progs; + ret = rpcsvc_program_register(rpc, prog, _gf_false); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, "name%s", + prog->progname, "prognum=%d", prog->prognum, "pogver=%d", + prog->progver, NULL); + goto dealloc_rpc; + } + + progs++; + } + + dict_unref(options); + return rpc; + +dealloc_rpc: + GF_FREE(rpc); +dealloc_dict: + dict_unref(options); + return NULL; +} diff --git a/xlators/features/changelog/src/changelog-rpc-common.h b/xlators/features/changelog/src/changelog-rpc-common.h new file mode 100644 index 00000000000..4d9aa2c694b --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc-common.h @@ -0,0 +1,85 @@ +/* + Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CHANGELOG_RPC_COMMON_H +#define __CHANGELOG_RPC_COMMON_H + +#include "rpcsvc.h" +#include "rpc-clnt.h" +#include <glusterfs/gf-event.h> +#include <glusterfs/call-stub.h> + +#include "changelog-xdr.h" +#include "xdr-generic.h" + +#include "changelog.h" + +/** + * Let's keep this non-configurable for now. + */ +#define NR_ROTT_BUFFS 4 +#define NR_DISPATCHERS (NR_ROTT_BUFFS - 1) + +enum changelog_rpc_procnum { + CHANGELOG_RPC_PROC_NULL = 0, + CHANGELOG_RPC_PROBE_FILTER = 1, + CHANGELOG_RPC_PROC_MAX = 2, +}; + +#define CHANGELOG_RPC_PROGNUM 1885957735 +#define CHANGELOG_RPC_PROGVER 1 + +/** + * reverse connection: data xfer path + */ +enum changelog_reverse_rpc_procnum { + CHANGELOG_REV_PROC_NULL = 0, + CHANGELOG_REV_PROC_EVENT = 1, + CHANGELOG_REV_PROC_MAX = 2, +}; + +#define CHANGELOG_REV_RPC_PROCNUM 1886350951 +#define CHANGELOG_REV_RPC_PROCVER 1 + +typedef struct changelog_rpc { + rpcsvc_t *svc; + struct rpc_clnt *rpc; + char sock[UNIX_PATH_MAX]; /* tied to server */ +} changelog_rpc_t; + +/* event poller */ +void * +changelog_rpc_poller(void *); + +/* CLIENT API */ +struct rpc_clnt * +changelog_rpc_client_init(xlator_t *, void *, char *, rpc_clnt_notify_t); + +int +changelog_rpc_sumbit_req(struct rpc_clnt *, void *, call_frame_t *, + rpc_clnt_prog_t *, int, struct iovec *, int, + struct iobref *, xlator_t *, fop_cbk_fn_t, xdrproc_t); + +int +changelog_invoke_rpc(xlator_t *, struct rpc_clnt *, rpc_clnt_prog_t *, int, + void *); + +/* SERVER API */ +int +changelog_rpc_sumbit_reply(rpcsvc_request_t *, void *, struct iovec *, int, + struct iobref *, xdrproc_t); +rpcsvc_t * +changelog_rpc_server_init(xlator_t *, char *, void *, 
rpcsvc_notify_t, + struct rpcsvc_program **); +void +changelog_rpc_server_destroy(xlator_t *, rpcsvc_t *, char *, rpcsvc_notify_t, + struct rpcsvc_program **); + +#endif diff --git a/xlators/features/changelog/src/changelog-rpc.c b/xlators/features/changelog/src/changelog-rpc.c new file mode 100644 index 00000000000..440b88091a6 --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc.c @@ -0,0 +1,440 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/syscall.h> +#include "changelog-rpc.h" +#include "changelog-mem-types.h" +#include "changelog-ev-handle.h" + +static struct rpcsvc_program *changelog_programs[]; + +static void +changelog_cleanup_dispatchers(xlator_t *this, changelog_priv_t *priv, int count) +{ + for (count--; count >= 0; count--) { + (void)changelog_thread_cleanup(this, priv->ev_dispatcher[count]); + priv->ev_dispatcher[count] = 0; + } +} + +int +changelog_cleanup_rpc_threads(xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + changelog_clnt_t *conn = NULL; + + conn = &priv->connections; + if (!conn) + return 0; + + /** terminate RPC thread(s) */ + ret = changelog_thread_cleanup(this, priv->connector); + if (ret != 0) + goto error_return; + priv->connector = 0; + + /** terminate dispatcher thread(s) */ + changelog_cleanup_dispatchers(this, priv, priv->nr_dispatchers); + + /* destroy locks */ + ret = pthread_mutex_destroy(&conn->pending_lock); + if (ret != 0) + goto error_return; + ret = pthread_cond_destroy(&conn->pending_cond); + if (ret != 0) + goto error_return; + ret = LOCK_DESTROY(&conn->active_lock); + if (ret != 0) + goto error_return; + ret = LOCK_DESTROY(&conn->wait_lock); + if (ret != 0) + 
goto error_return; + return 0; + +error_return: + return -1; +} + +static int +changelog_init_rpc_threads(xlator_t *this, changelog_priv_t *priv, rbuf_t *rbuf, + int nr_dispatchers) +{ + int j = 0; + int ret = 0; + changelog_clnt_t *conn = NULL; + + conn = &priv->connections; + + conn->this = this; + conn->rbuf = rbuf; + conn->sequence = 1; /* start with sequence number one */ + + INIT_LIST_HEAD(&conn->pending); + INIT_LIST_HEAD(&conn->active); + INIT_LIST_HEAD(&conn->waitq); + + ret = pthread_mutex_init(&conn->pending_lock, NULL); + if (ret) + goto error_return; + ret = pthread_cond_init(&conn->pending_cond, NULL); + if (ret) + goto cleanup_pending_lock; + + ret = LOCK_INIT(&conn->active_lock); + if (ret) + goto cleanup_pending_cond; + ret = LOCK_INIT(&conn->wait_lock); + if (ret) + goto cleanup_active_lock; + + /* spawn reverse connection thread */ + ret = gf_thread_create(&priv->connector, NULL, changelog_ev_connector, conn, + "clogecon"); + if (ret != 0) + goto cleanup_wait_lock; + + /* spawn dispatcher thread(s) */ + priv->ev_dispatcher = GF_CALLOC(nr_dispatchers, sizeof(pthread_t), + gf_changelog_mt_ev_dispatcher_t); + if (!priv->ev_dispatcher) + goto cleanup_connector; + + /* spawn dispatcher threads */ + for (; j < nr_dispatchers; j++) { + ret = gf_thread_create(&priv->ev_dispatcher[j], NULL, + changelog_ev_dispatch, conn, "clogd%03hx", + j & 0x3ff); + if (ret != 0) { + changelog_cleanup_dispatchers(this, priv, j); + break; + } + } + + if (ret != 0) + goto cleanup_connector; + + priv->nr_dispatchers = nr_dispatchers; + return 0; + +cleanup_connector: + (void)pthread_cancel(priv->connector); +cleanup_wait_lock: + LOCK_DESTROY(&conn->wait_lock); +cleanup_active_lock: + LOCK_DESTROY(&conn->active_lock); +cleanup_pending_cond: + (void)pthread_cond_destroy(&conn->pending_cond); +cleanup_pending_lock: + (void)pthread_mutex_destroy(&conn->pending_lock); +error_return: + return -1; +} + +int +changelog_rpcsvc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, + 
void *data) +{ + xlator_t *this = NULL; + rpc_transport_t *trans = NULL; + rpc_transport_t *xprt = NULL; + rpc_transport_t *xp_next = NULL; + changelog_priv_t *priv = NULL; + uint64_t listnercnt = 0; + uint64_t xprtcnt = 0; + uint64_t clntcnt = 0; + rpcsvc_listener_t *listener = NULL; + rpcsvc_listener_t *next = NULL; + gf_boolean_t listner_found = _gf_false; + socket_private_t *sockpriv = NULL; + + if (!xl || !data || !rpc) { + gf_msg_callingfn("changelog", GF_LOG_WARNING, 0, + CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED, + "Calling rpc_notify without initializing"); + goto out; + } + + this = xl; + trans = data; + priv = this->private; + + if (!priv) { + gf_msg_callingfn("changelog", GF_LOG_WARNING, 0, + CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED, + "Calling rpc_notify without priv initializing"); + goto out; + } + + if (event == RPCSVC_EVENT_ACCEPT) { + GF_ATOMIC_INC(priv->xprtcnt); + LOCK(&priv->lock); + { + list_add_tail(&trans->list, &priv->xprt_list); + } + UNLOCK(&priv->lock); + goto out; + } + + if (event == RPCSVC_EVENT_DISCONNECT) { + list_for_each_entry_safe(listener, next, &rpc->listeners, list) + { + if (listener && listener->trans) { + if (listener->trans == trans) { + listnercnt = GF_ATOMIC_DEC(priv->listnercnt); + listner_found = _gf_true; + rpcsvc_listener_destroy(listener); + } + } + } + + if (listnercnt > 0) { + goto out; + } + if (listner_found) { + LOCK(&priv->lock); + list_for_each_entry_safe(xprt, xp_next, &priv->xprt_list, list) + { + sockpriv = (socket_private_t *)(xprt->private); + gf_log("changelog", GF_LOG_INFO, + "Send disconnect" + " on socket %d", + sockpriv->sock); + rpc_transport_disconnect(xprt, _gf_false); + } + UNLOCK(&priv->lock); + goto out; + } + LOCK(&priv->lock); + { + list_del_init(&trans->list); + } + UNLOCK(&priv->lock); + + xprtcnt = GF_ATOMIC_DEC(priv->xprtcnt); + clntcnt = GF_ATOMIC_GET(priv->clntcnt); + if (!xprtcnt && !clntcnt) { + changelog_process_cleanup_event(this); + } + } + +out: + return 0; +} + +void 
+changelog_process_cleanup_event(xlator_t *this) +{ + gf_boolean_t cleanup_notify = _gf_false; + changelog_priv_t *priv = NULL; + char sockfile[UNIX_PATH_MAX] = { + 0, + }; + + if (!this) + return; + priv = this->private; + if (!priv) + return; + + LOCK(&priv->lock); + { + cleanup_notify = priv->notify_down; + priv->notify_down = _gf_true; + } + UNLOCK(&priv->lock); + + if (priv->victim && !cleanup_notify) { + default_notify(this, GF_EVENT_PARENT_DOWN, priv->victim); + + if (priv->rpc) { + /* sockfile path could have been saved to avoid this */ + CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, + UNIX_PATH_MAX); + sys_unlink(sockfile); + (void)rpcsvc_unregister_notify(priv->rpc, changelog_rpcsvc_notify, + this); + if (priv->rpc->rxpool) { + mem_pool_destroy(priv->rpc->rxpool); + priv->rpc->rxpool = NULL; + } + GF_FREE(priv->rpc); + priv->rpc = NULL; + } + } +} + +void +changelog_destroy_rpc_listner(xlator_t *this, changelog_priv_t *priv) +{ + char sockfile[UNIX_PATH_MAX] = { + 0, + }; + + /* sockfile path could have been saved to avoid this */ + CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, UNIX_PATH_MAX); + changelog_rpc_server_destroy(this, priv->rpc, sockfile, + changelog_rpcsvc_notify, changelog_programs); +} + +rpcsvc_t * +changelog_init_rpc_listener(xlator_t *this, changelog_priv_t *priv, + rbuf_t *rbuf, int nr_dispatchers) +{ + int ret = 0; + char sockfile[UNIX_PATH_MAX] = { + 0, + }; + rpcsvc_t *svcp; + + ret = changelog_init_rpc_threads(this, priv, rbuf, nr_dispatchers); + if (ret) + return NULL; + + CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, UNIX_PATH_MAX); + (void)sys_unlink(sockfile); + svcp = changelog_rpc_server_init( + this, sockfile, NULL, changelog_rpcsvc_notify, changelog_programs); + return svcp; +} + +void +changelog_rpc_clnt_cleanup(changelog_rpc_clnt_t *crpc) +{ + if (!crpc) + return; + crpc->c_clnt = NULL; + LOCK_DESTROY(&crpc->lock); + GF_FREE(crpc); +} + +static changelog_rpc_clnt_t * 
+changelog_rpc_clnt_init(xlator_t *this, changelog_probe_req *rpc_req, + changelog_clnt_t *c_clnt) +{ + int ret = 0; + changelog_rpc_clnt_t *crpc = NULL; + + crpc = GF_CALLOC(1, sizeof(*crpc), gf_changelog_mt_rpc_clnt_t); + if (!crpc) + goto error_return; + INIT_LIST_HEAD(&crpc->list); + + /* Take a ref, the last unref will be on RPC_CLNT_DESTROY + * which comes as a result of last rpc_clnt_unref. + */ + GF_ATOMIC_INIT(crpc->ref, 1); + changelog_set_disconnect_flag(crpc, _gf_false); + + crpc->filter = rpc_req->filter; + (void)memcpy(crpc->sock, rpc_req->sock, strlen(rpc_req->sock)); + + crpc->this = this; + crpc->c_clnt = c_clnt; + crpc->cleanup = changelog_rpc_clnt_cleanup; + + ret = LOCK_INIT(&crpc->lock); + if (ret != 0) + goto dealloc_crpc; + return crpc; + +dealloc_crpc: + GF_FREE(crpc); +error_return: + return NULL; +} + +/** + * Actor declarations + */ + +/** + * @probe_handler + * A probe RPC call spawns a connect back to the caller. Caller also + * passes an hint which acts as a filter for selecting updates. 
+ */ + +int +changelog_handle_probe(rpcsvc_request_t *req) +{ + int ret = 0; + xlator_t *this = NULL; + rpcsvc_t *svc = NULL; + changelog_priv_t *priv = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + + changelog_probe_req rpc_req = { + 0, + }; + changelog_probe_rsp rpc_rsp = { + 0, + }; + + this = req->trans->xl; + if (this->cleanup_starting) { + gf_smsg(this->name, GF_LOG_DEBUG, 0, CHANGELOG_MSG_CLEANUP_ALREADY_SET, + NULL); + return 0; + } + + ret = xdr_to_generic(req->msg[0], &rpc_req, + (xdrproc_t)xdr_changelog_probe_req); + if (ret < 0) { + gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_HANDLE_PROBE_ERROR, NULL); + req->rpc_err = GARBAGE_ARGS; + goto handle_xdr_error; + } + + /* ->xl hidden in rpcsvc */ + svc = rpcsvc_request_service(req); + this = svc->xl; + priv = this->private; + c_clnt = &priv->connections; + + crpc = changelog_rpc_clnt_init(this, &rpc_req, c_clnt); + if (!crpc) + goto handle_xdr_error; + + changelog_ev_queue_connection(c_clnt, crpc); + rpc_rsp.op_ret = 0; + + goto submit_rpc; + +handle_xdr_error: + rpc_rsp.op_ret = -1; +submit_rpc: + (void)changelog_rpc_sumbit_reply(req, &rpc_rsp, NULL, 0, NULL, + (xdrproc_t)xdr_changelog_probe_rsp); + return 0; +} + +/** + * RPC declarations + */ + +static rpcsvc_actor_t changelog_svc_actors[CHANGELOG_RPC_PROC_MAX] = { + [CHANGELOG_RPC_PROBE_FILTER] = {"CHANGELOG PROBE FILTER", + changelog_handle_probe, NULL, + CHANGELOG_RPC_PROBE_FILTER, DRC_NA, 0}, +}; + +static struct rpcsvc_program changelog_svc_prog = { + .progname = CHANGELOG_RPC_PROGNAME, + .prognum = CHANGELOG_RPC_PROGNUM, + .progver = CHANGELOG_RPC_PROGVER, + .numactors = CHANGELOG_RPC_PROC_MAX, + .actors = changelog_svc_actors, + .synctask = _gf_true, +}; + +static struct rpcsvc_program *changelog_programs[] = { + &changelog_svc_prog, + NULL, +}; diff --git a/xlators/features/changelog/src/changelog-rpc.h b/xlators/features/changelog/src/changelog-rpc.h new file mode 100644 index 00000000000..b1707565249 --- 
/dev/null +++ b/xlators/features/changelog/src/changelog-rpc.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CHANGELOG_RPC_H +#define __CHANGELOG_RPC_H + +#include <glusterfs/xlator.h> +#include "changelog-helpers.h" + +/* one time */ +#include "socket.h" +#include "changelog-rpc-common.h" + +#define CHANGELOG_RPC_PROGNAME "GlusterFS Changelog" + +rpcsvc_t * +changelog_init_rpc_listener(xlator_t *, changelog_priv_t *, rbuf_t *, int); + +void +changelog_destroy_rpc_listner(xlator_t *, changelog_priv_t *); + +int +changelog_cleanup_rpc_threads(xlator_t *this, changelog_priv_t *priv); +#endif diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c index c147f68ca85..841545ae359 100644 --- a/xlators/features/changelog/src/changelog-rt.c +++ b/xlators/features/changelog/src/changelog-rt.c @@ -8,65 +8,59 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" -#include "logging.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/logging.h> #include "changelog-rt.h" #include "changelog-mem-types.h" int -changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd) +changelog_rt_init(xlator_t *this, changelog_dispatcher_t *cd) { - changelog_rt_t *crt = NULL; + changelog_rt_t *crt = NULL; - crt = GF_CALLOC (1, sizeof (*crt), - gf_changelog_mt_rt_t); - if (!crt) - return -1; + crt = GF_CALLOC(1, sizeof(*crt), gf_changelog_mt_rt_t); + if (!crt) + return -1; - LOCK_INIT (&crt->lock); + LOCK_INIT(&crt->lock); - cd->cd_data = crt; - cd->dispatchfn = &changelog_rt_enqueue; + cd->cd_data = crt; + cd->dispatchfn = &changelog_rt_enqueue; - return 0; + return 0; } int -changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd) +changelog_rt_fini(xlator_t *this, changelog_dispatcher_t *cd) { - changelog_rt_t *crt = NULL; + changelog_rt_t *crt = NULL; - crt = cd->cd_data; + crt = cd->cd_data; - LOCK_DESTROY (&crt->lock); - GF_FREE (crt); + LOCK_DESTROY(&crt->lock); + GF_FREE(crt); - return 0; + return 0; } int -changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, - changelog_log_data_t *cld_0, changelog_log_data_t *cld_1) +changelog_rt_enqueue(xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1) { - int ret = 0; - changelog_rt_t *crt = NULL; + int ret = 0; + changelog_rt_t *crt = NULL; - crt = (changelog_rt_t *) cbatch; + crt = (changelog_rt_t *)cbatch; - LOCK (&crt->lock); - { - ret = changelog_handle_change (this, priv, cld_0); - if (!ret && cld_1) - ret = changelog_handle_change (this, priv, cld_1); - } - UNLOCK (&crt->lock); + LOCK(&crt->lock); + { + ret = changelog_handle_change(this, priv, cld_0); + if (!ret && cld_1) + ret = changelog_handle_change(this, priv, cld_1); + } + 
UNLOCK(&crt->lock); - return ret; + return ret; } diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h index 1fc2bbc5bb9..28b9827d85b 100644 --- a/xlators/features/changelog/src/changelog-rt.h +++ b/xlators/features/changelog/src/changelog-rt.h @@ -11,23 +11,23 @@ #ifndef _CHANGELOG_RT_H #define _CHANGELOG_RT_H -#include "locking.h" -#include "timer.h" +#include <glusterfs/locking.h> +#include <glusterfs/timer.h> #include "pthread.h" #include "changelog-helpers.h" /* unused as of now - may be you would need it later */ typedef struct changelog_rt { - gf_lock_t lock; + gf_lock_t lock; } changelog_rt_t; int -changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd); +changelog_rt_init(xlator_t *this, changelog_dispatcher_t *cd); int -changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd); +changelog_rt_fini(xlator_t *this, changelog_dispatcher_t *cd); int -changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, - changelog_log_data_t *cld_0, changelog_log_data_t *cld_1); +changelog_rt_enqueue(xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1); #endif /* _CHANGELOG_RT_H */ diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c index cea0e8c70b5..6a6e5af859e 100644 --- a/xlators/features/changelog/src/changelog.c +++ b/xlators/features/changelog/src/changelog.c @@ -8,34 +8,38 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" -#include "logging.h" -#include "iobuf.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/syscall.h> +#include <glusterfs/logging.h> +#include <glusterfs/iobuf.h> #include "changelog-rt.h" #include "changelog-encoders.h" #include "changelog-mem-types.h" +#include "changelog-messages.h" #include <pthread.h> +#include <signal.h> -#include "changelog-notifier.h" +#include "changelog-rpc.h" +#include "errno.h" -static struct changelog_bootstrap -cb_bootstrap[] = { - { - .mode = CHANGELOG_MODE_RT, - .ctor = changelog_rt_init, - .dtor = changelog_rt_fini, - }, +static struct changelog_bootstrap cb_bootstrap[] = { + { + .mode = CHANGELOG_MODE_RT, + .ctor = changelog_rt_init, + .dtor = changelog_rt_fini, + }, }; +static int +changelog_init_rpc(xlator_t *this, changelog_priv_t *priv); + +static int +changelog_init(xlator_t *this, changelog_priv_t *priv); + /* Entry operations - TYPE III */ /** @@ -47,788 +51,1566 @@ cb_bootstrap[] = { /* rmdir */ int32_t -changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +changelog_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - unwind: - CHANGELOG_STACK_UNWIND (rmdir, frame, 
op_ret, op_errno, - preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, + postparent, xdata); + return 0; } int32_t -changelog_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflags, dict_t *xdata) +changelog_rmdir_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflags, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; - CHANGELOG_INIT_NOCHECK (this, frame->local, - NULL, loc->inode->gfid, 2); - - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); - - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); - - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + gf_msg_debug(this->name, 0, "Dequeue rmdir"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata); + return 0; +} - wind: - STACK_WIND (frame, changelog_rmdir_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir, - loc, xflags, xdata); - return 0; +int32_t +changelog_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + + INIT_LIST_HEAD(&queue); + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + 
CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + co++; + if (priv->capture_del_path) { + CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name, del_entry_fn, + del_entry_free_fn, xtra_len, wind, + _gf_true); + } else { + CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name, del_entry_fn, + del_entry_free_fn, xtra_len, wind, + _gf_false); + } + + changelog_set_usable_record_and_length(frame->local, xtra_len, 2); + + /* changelog barrier */ + /* Color assignment and increment of fop_cnt for rmdir/unlink/rename + * should be made with in priv lock if changelog barrier is not enabled. + * Because if counter is not incremented yet, draining wakes up and + * publishes the changelog but later these fops might hit the disk and + * present in snapped volume but where as the intention is these fops + * should not be present in snapped volume. + */ + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_rmdir_stub(frame, changelog_rmdir_resume, loc, xflags, + xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueue rmdir"); + goto out; + } + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rmdir", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + + /* changelog barrier */ + +wind: + STACK_WIND(frame, changelog_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata); +out: + return 0; } /* unlink */ int32_t -changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +changelog_unlink_cbk(call_frame_t *frame, void *cookie, 
xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - unwind: - CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno, - preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, + postparent, xdata); + return 0; } int32_t -changelog_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflags, dict_t *xdata) +changelog_unlink_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflags, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + priv = this->private; - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, loc->inode->gfid, 2); + gf_msg_debug(this->name, 0, "Dequeue unlink"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata); + return 0; +} - co = changelog_get_usable_buffer (frame->local); +int32_t +changelog_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + 
gf_boolean_t barrier_enabled = _gf_false; + dht_changelog_rename_info_t *info = NULL; + int ret = 0; + char *old_name = NULL; + char *new_name = NULL; + char *nname = NULL; + + INIT_LIST_HEAD(&queue); + priv = this->private; + + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + ret = dict_get_bin(xdata, DHT_CHANGELOG_RENAME_OP_KEY, (void **)&info); + if (!ret) { /* special case: unlink considered as rename */ + /* 3 == fop + oldloc + newloc */ + old_name = alloca(info->oldname_len); + new_name = alloca(info->newname_len); + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 3); + + co = changelog_get_usable_buffer(frame->local); if (!co) - goto wind; + goto wind; - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_RENAME, fop_fn, xtra_len); co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); + strncpy(old_name, info->buffer, info->oldname_len); + CHANGELOG_FILL_ENTRY(co, info->old_pargfid, old_name, entry_fn, + entry_free_fn, xtra_len, wind); - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + co++; + /* new name resides just after old name */ + nname = info->buffer + info->oldname_len; + strncpy(new_name, nname, info->newname_len); + CHANGELOG_FILL_ENTRY(co, info->new_pargfid, new_name, entry_fn, + entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 3); + } else { /* default unlink */ + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind); + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; - wind: - STACK_WIND (frame, changelog_unlink_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, - loc, xflags, xdata); - return 0; + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + co++; + if (priv->capture_del_path) { + CHANGELOG_FILL_ENTRY_DIR_PATH(co, 
loc->pargfid, loc->name, + del_entry_fn, del_entry_free_fn, + xtra_len, wind, _gf_true); + } else { + CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name, + del_entry_fn, del_entry_free_fn, + xtra_len, wind, _gf_false); + } + + changelog_set_usable_record_and_length(frame->local, xtra_len, 2); + } + + /* changelog barrier */ + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_unlink_stub(frame, changelog_unlink_resume, loc, xflags, + xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueue unlink"); + goto out; + } + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=unlink", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + + /* changelog barrier */ + +wind: + STACK_WIND(frame, changelog_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata); +out: + return 0; } /* rename */ int32_t -changelog_rename_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *buf, struct iatt *preoldparent, - struct iatt *postoldparent, struct iatt *prenewparent, - struct iatt *postnewparent, dict_t *xdata) +changelog_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); + 
changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + return 0; +} - priv = this->private; - local = frame->local; +int32_t +changelog_rename_resume(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + priv = this->private; - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + gf_msg_debug(this->name, 0, "Dequeue rename"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; +} - unwind: - CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno, - buf, preoldparent, postoldparent, - prenewparent, postnewparent, xdata); - return 0; +int32_t +changelog_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + dht_changelog_rename_info_t *info = NULL; + int ret = 0; + + INIT_LIST_HEAD(&queue); + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + ret = dict_get_bin(xdata, DHT_CHANGELOG_RENAME_OP_KEY, (void **)&info); + if (ret && oldloc->inode->ia_type != IA_IFDIR) { + /* xdata "NOT" set for a non-directory, + * Special rename => avoid logging */ + goto wind; + } + + /* 3 == fop + oldloc + newloc */ + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, oldloc->inode->gfid, 3); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + co++; + 
CHANGELOG_FILL_ENTRY(co, oldloc->pargfid, oldloc->name, entry_fn, + entry_free_fn, xtra_len, wind); + + co++; + CHANGELOG_FILL_ENTRY(co, newloc->pargfid, newloc->name, entry_fn, + entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 3); + /* changelog barrier */ + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_rename_stub(frame, changelog_rename_resume, oldloc, + newloc, xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueue rename"); + goto out; + } + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rename", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + /* changelog barrier */ + +wind: + STACK_WIND(frame, changelog_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); +out: + return 0; } +/* link */ int32_t -changelog_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +changelog_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + local = frame->local; - /* 3 == fop + oldloc + newloc */ - CHANGELOG_INIT_NOCHECK (this, frame->local, - NULL, oldloc->inode->gfid, 3); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - co = 
changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} - co++; - CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name, - entry_fn, entry_free_fn, xtra_len, wind); +int32_t +changelog_link_resume(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; - co++; - CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, - entry_fn, entry_free_fn, xtra_len, wind); + GF_VALIDATE_OR_GOTO("changelog", this, out); + GF_VALIDATE_OR_GOTO("changelog", this->fops, out); + GF_VALIDATE_OR_GOTO("changelog", frame, out); - changelog_set_usable_record_and_length (frame->local, xtra_len, 3); + priv = this->private; - wind: - STACK_WIND (frame, changelog_rename_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->rename, - oldloc, newloc, xdata); - return 0; + gf_msg_debug(this->name, 0, "Dequeuing link"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +out: + return -1; +} +int32_t +changelog_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + + priv = this->private; + + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind); + + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, oldloc->gfid, 2); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + 
CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY(co, newloc->pargfid, newloc->name, entry_fn, + entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 2); + + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_link_stub(frame, changelog_link_resume, oldloc, newloc, + xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueued link"); + goto out; + } + + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_FOP_FAILED, + "fop=link", NULL); + chlog_barrier_dequeue_all(this, &queue); + } +wind: + STACK_WIND(frame, changelog_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); +out: + return 0; } -/* link */ +/* mkdir */ int32_t -changelog_link_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, +changelog_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - unwind: - CHANGELOG_STACK_UNWIND (link, frame, op_ret, 
op_errno, - inode, buf, preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; } int32_t -changelog_link (call_frame_t *frame, - xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +changelog_mkdir_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata) { - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; + GF_VALIDATE_OR_GOTO("changelog", this, out); + GF_VALIDATE_OR_GOTO("changelog", this->fops, out); + GF_VALIDATE_OR_GOTO("changelog", frame, out); - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); - CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + priv = this->private; - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, oldloc->gfid, 2); + gf_msg_debug(this->name, 0, "Dequeuing mkdir"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; +out: + return -1; +} - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; +int32_t +changelog_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = { + 0, + }; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); + if (ret) { + gf_msg_debug(this->name, 0, "failed to get gfid from dict"); + goto wind; + } + + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5); + + co = 
changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, S_IFDIR | mode, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn, + xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 5); + + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_mkdir_stub(frame, changelog_mkdir_resume, loc, mode, + umask, xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueued mkdir"); + goto out; + } + + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mkdir", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + +wind: + STACK_WIND(frame, changelog_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); +out: + return 0; +} - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); +/* symlink */ - co++; - CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, - entry_fn, entry_free_fn, xtra_len, wind); +int32_t +changelog_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + priv = this->private; + local = 
frame->local; - wind: - STACK_WIND (frame, changelog_link_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->link, - oldloc, newloc, xdata); - return 0; -} + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); -/* mkdir */ + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); + +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} int32_t -changelog_mkdir_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +changelog_symlink_resume(call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc, mode_t umask, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - local = frame->local; + GF_VALIDATE_OR_GOTO("changelog", this, out); + GF_VALIDATE_OR_GOTO("changelog", this->fops, out); + GF_VALIDATE_OR_GOTO("changelog", frame, out); - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + priv = this->private; - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + gf_msg_debug(this->name, 0, "Dequeuing symlink"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; +out: + return -1; +} - unwind: - CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - return 0; +int32_t +changelog_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + int ret = -1; + size_t xtra_len = 0; + uuid_t gfid = { + 0, + }; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; 
+ gf_boolean_t barrier_enabled = _gf_false; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); + if (ret) { + gf_msg_debug(this->name, 0, "failed to get gfid from dict"); + goto wind; + } + + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + co++; + + CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn, + xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 2); + + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_symlink_stub(frame, changelog_symlink_resume, linkname, + loc, umask, xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueued symlink"); + goto out; + } + + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=symlink", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + +wind: + STACK_WIND(frame, changelog_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); +out: + return 0; } +/* mknod */ + int32_t -changelog_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +changelog_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - 
changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + local = frame->local; - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); +int32_t +changelog_mknod_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); + GF_VALIDATE_OR_GOTO("changelog", this, out); + GF_VALIDATE_OR_GOTO("changelog", this->fops, out); + GF_VALIDATE_OR_GOTO("changelog", frame, out); - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + priv = this->private; - wind: - STACK_WIND (frame, changelog_mkdir_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->mkdir, - loc, mode, umask, xdata); - return 0; + gf_msg_debug(this->name, 0, "Dequeuing mknod"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +out: + return -1; } -/* symlink */ - int32_t -changelog_symlink_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, 
struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +changelog_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + int ret = -1; + uuid_t gfid = { + 0, + }; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + + priv = this->private; + + /* Check whether changelog active */ + if (!(priv->active)) + goto wind; + + /* Check whether rebalance activity */ + if (frame->root->pid == GF_CLIENT_PID_DEFRAG) + goto wind; + + /* If tier-dht linkto is SET, ignore about verifiying : + * 1. Whether internal fop AND + * 2. Whether tier rebalance process activity (this will help in + * recording mknod if tier rebalance process calls this mknod) */ + if (!(dict_get(xdata, "trusted.tier.tier-dht.linkto"))) { + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind); + if (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG) + goto wind; + } + + ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); + if (ret) { + gf_msg_debug(this->name, 0, "failed to get gfid from dict"); + goto wind; + } + + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, mode, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn, + xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 5); + + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_mknod_stub(frame, 
changelog_mknod_resume, loc, mode, dev, + umask, xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled && stub) { + gf_msg_debug(this->name, 0, "Enqueued mknod"); + goto out; + } + + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mknod", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + +wind: + STACK_WIND(frame, changelog_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata); +out: + return 0; +} - priv = this->private; - local = frame->local; +/* create */ - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); +int32_t +changelog_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t ret = 0; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + changelog_event_t ev = { + 0, + }; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); + + /* fill the event structure.. 
similar to open() */ + ev.ev_type = CHANGELOG_OP_TYPE_CREATE; + gf_uuid_copy(ev.u.create.gfid, buf->ia_gfid); + ev.u.create.flags = fd->flags; + changelog_dispatch_event(this, priv, &ev); + + if (changelog_ev_selected(this, &priv->ev_selection, + CHANGELOG_OP_TYPE_RELEASE)) { + ret = fd_ctx_set(fd, this, (uint64_t)(long)0x1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT, + NULL); + } - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); - unwind: - CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; } int32_t -changelog_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, - mode_t umask, dict_t *xdata) +changelog_create_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) { - int ret = -1; - size_t xtra_len = 0; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + GF_VALIDATE_OR_GOTO("changelog", this, out); + GF_VALIDATE_OR_GOTO("changelog", this->fops, out); + GF_VALIDATE_OR_GOTO("changelog", frame, out); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); + priv = this->private; - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + gf_msg_debug(this->name, 0, "Dequeuing create"); + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_create_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; +out: + return -1; +} - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); +int32_t +changelog_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = { + 0, + }; + changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + size_t xtra_len = 0; + call_stub_t *stub = NULL; + struct list_head queue = { + 0, + }; + gf_boolean_t barrier_enabled = _gf_false; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); + if (ret) { + gf_msg_debug(this->name, 0, "failed to get gfid from dict"); + goto wind; + } + + /* init with two extra records */ + CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5); + if (!frame->local) + goto wind; + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, mode, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len); + co++; + + CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn, + xtra_len, wind); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 5); + + LOCK(&priv->lock); + { + if ((barrier_enabled = priv->barrier_enabled)) { + stub = fop_create_stub(frame, changelog_create_resume, loc, flags, + mode, umask, fd, xdata); + if (!stub) + __chlog_barrier_disable(this, &queue); + else + __chlog_barrier_enqueue(this, stub); + } else { + ((changelog_local_t *)frame->local)->color = priv->current_color; + changelog_inc_fop_cnt(this, priv, frame->local); + } + } + UNLOCK(&priv->lock); + + if (barrier_enabled 
&& stub) { + gf_msg_debug(this->name, 0, "Enqueued create"); + goto out; + } + + if (barrier_enabled && !stub) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=create", NULL); + chlog_barrier_dequeue_all(this, &queue); + } + +wind: + STACK_WIND(frame, changelog_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); +out: + return 0; +} - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); +/* }}} */ - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); +/* Metadata modification fops - TYPE II */ - wind: - STACK_WIND (frame, changelog_symlink_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->symlink, - linkname, loc, umask, xdata); - return 0; -} +/* {{{ */ -/* mknod */ +/* {f}setattr */ int32_t -changelog_mknod_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +changelog_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop_stbuf, struct iatt *postop_stbuf, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA); - unwind: - CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, preop_stbuf, + 
postop_stbuf, xdata); + + return 0; } int32_t -changelog_mknod (call_frame_t *frame, - xlator_t *this, loc_t *loc, - mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) +changelog_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - size_t xtra_len = 0; - changelog_priv_t *priv = NULL; - changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto wind; - } - uuid_copy (gfid, uuid_req); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1); + if (!frame->local) + goto wind; - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; - - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); - wind: - STACK_WIND (frame, changelog_mknod_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, - loc, mode, dev, umask, xdata); - return 0; +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; } -/* creat */ - int32_t 
-changelog_create_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, +changelog_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) + struct iatt *preop_stbuf, struct iatt *postop_stbuf, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA); - unwind: - CHANGELOG_STACK_UNWIND (create, frame, - op_ret, op_errno, fd, inode, - buf, preparent, postparent, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(setattr, frame, op_ret, op_errno, preop_stbuf, + postop_stbuf, xdata); + + return 0; } int32_t -changelog_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *xdata) +changelog_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - int ret = -1; - uuid_t gfid = {0,}; - void *uuid_req = NULL; - changelog_opt_t *co = NULL; - changelog_priv_t *priv = NULL; - size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + uuid_t shard_root_gfid = { + 0, + }; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get gfid from dict"); - goto 
wind; - } - uuid_copy (gfid, uuid_req); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind); - /* init with two extra records */ - CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); - if (!frame->local) - goto wind; + /* Do not record META on .shard */ + gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid); + if (gf_uuid_compare(loc->gfid, shard_root_gfid) == 0) { + goto wind; + } - co = changelog_get_usable_buffer (frame->local); - if (!co) - goto wind; + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1); + if (!frame->local) + goto wind; - co++; - CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, - entry_fn, entry_free_fn, xtra_len, wind); + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; - changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); - wind: - STACK_WIND (frame, changelog_create_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; } -/* }}} */ +/* {f}removexattr */ +int32_t +changelog_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; -/* Metadata modification fops - TYPE II */ + priv = this->private; + local = frame->local; -/* {{{ */ + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); -/* {f}setattr */ + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR); + +unwind: + changelog_dec_fop_cnt(this, priv, local); + 
CHANGELOG_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} int32_t -changelog_fsetattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *preop_stbuf, - struct iatt *postop_stbuf, dict_t *xdata) +changelog_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; - priv = this->private; - local = frame->local; + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1); - unwind: - CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, - preop_stbuf, postop_stbuf, xdata); + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; - return 0; + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; } int32_t -changelog_fsetattr (call_frame_t *frame, - xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +changelog_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + local = frame->local; - CHANGELOG_INIT (this, frame->local, - fd->inode, 
fd->inode->gfid, 0); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - wind: - STACK_WIND (frame, changelog_fsetattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid, xdata); - return 0; + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + return 0; } int32_t -changelog_setattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *preop_stbuf, - struct iatt *postop_stbuf, dict_t *xdata) +changelog_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; - priv = this->private; - local = frame->local; + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1); - unwind: - CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno, - preop_stbuf, postop_stbuf, xdata); + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; - return 0; + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; } +/* {f}setxattr */ + int32_t -changelog_setattr (call_frame_t *frame, - xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +changelog_setxattr_cbk(call_frame_t 
*frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + local = frame->local; - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - wind: - STACK_WIND (frame, changelog_setattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid, xdata); - return 0; -} + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR); -/* {f}removexattr */ +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); -int32_t -changelog_fremovexattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + return 0; +} - priv = this->private; - local = frame->local; +/* changelog_handle_virtual_xattr: + * Handles virtual setxattr 'glusterfs.geo-rep.trigger-sync' on files. + * Following is the behaviour based on the value of xattr. + * 1: Captures only DATA entry in changelog. + * 2: Tries to captures both ENTRY and DATA entry in + * changelog. If failed to get pargfid, only DATA + * entry is captured. + * any other value: ENOTSUP is returned. 
+ */ +static void +changelog_handle_virtual_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + int32_t value = 0; + int ret = 0; + int dict_ret = 0; + gf_boolean_t valid = _gf_false; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + priv = this->private; + GF_ASSERT(priv); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + dict_ret = dict_get_int32(dict, GF_XATTR_TRIGGER_SYNC, &value); - unwind: - CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + if ((dict_ret == 0 && value == 1) && ((loc->inode->ia_type == IA_IFDIR) || + (loc->inode->ia_type == IA_IFREG))) + valid = _gf_true; - return 0; + if (valid) { + ret = changelog_fill_entry_buf(frame, this, loc, &local); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_ENTRY_BUF_INFO, + "gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto unwind; + } + changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + /* Capture DATA only if it's a file. */ + if (loc->inode->ia_type != IA_IFDIR) + changelog_update(this, priv, frame->local, CHANGELOG_TYPE_DATA); + /* Assign local to prev_entry, so unwind will take + * care of cleanup. 
*/ + ((changelog_local_t *)(frame->local))->prev_entry = local; + CHANGELOG_STACK_UNWIND(setxattr, frame, 0, 0, NULL); + return; + } else { + CHANGELOG_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL); + return; + } } int32_t -changelog_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +changelog_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1); - wind: - STACK_WIND (frame, changelog_fremovexattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->fremovexattr, - fd, name, xdata); + /* On setting this virtual xattr on a file, an explicit data + * sync is triggered from geo-rep as CREATE|DATA entry is + * recorded in changelog based on xattr value. 
+ */ + if (dict_get(dict, GF_XATTR_TRIGGER_SYNC)) { + changelog_handle_virtual_xattr(frame, this, loc, dict); return 0; + } + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; } int32_t -changelog_removexattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +changelog_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR); - unwind: - CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); - return 0; + return 0; } int32_t -changelog_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +changelog_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + 
CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - wind: - STACK_WIND (frame, changelog_removexattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->removexattr, - loc, name, xdata); - return 0; -} + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1); -/* {f}setxattr */ + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +} int32_t -changelog_setxattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +changelog_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA); - unwind: - CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(xattrop, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } int32_t -changelog_setxattr (call_frame_t *frame, - xlator_t *this, loc_t *loc, - dict_t *dict, int32_t 
flags, dict_t *xdata) +changelog_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + int ret = 0; + void *size_attr = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + ret = dict_get_ptr(xattr, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) + goto wind; - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - wind: - STACK_WIND (frame, changelog_setxattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->setxattr, - loc, dict, flags, xdata); - return 0; + CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1); + + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata); + return 0; } int32_t -changelog_fsetxattr_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +changelog_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, 
local, CHANGELOG_TYPE_METADATA); + changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR); - unwind: - CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } int32_t -changelog_fsetxattr (call_frame_t *frame, - xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) +changelog_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + size_t xtra_len = 0; + void *size_attr = NULL; + int ret = 0; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + ret = dict_get_ptr(xattr, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) + goto wind; - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_OP_BOUNDARY_CHECK(frame, wind); - wind: - STACK_WIND (frame, changelog_fsetxattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, - fd, dict, flags, xdata); - return 0; -} + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1); -/* }}} */ + co = changelog_get_usable_buffer(frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len); + changelog_set_usable_record_and_length(frame->local, xtra_len, 1); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata); + return 0; +} +/* }}} */ /* Data modification fops - TYPE I */ @@ -837,130 +1619,277 @@ changelog_fsetxattr (call_frame_t *frame, /* {f}truncate() */ int32_t -changelog_truncate_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, 
int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +changelog_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + changelog_update(this, priv, local, CHANGELOG_TYPE_DATA); - unwind: - CHANGELOG_STACK_UNWIND (truncate, frame, - op_ret, op_errno, prebuf, postbuf, xdata); - return 0; +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } int32_t -changelog_truncate (call_frame_t *frame, - xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) +changelog_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - loc->inode, loc->inode->gfid, 0); + CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 0); + LOCK(&priv->c_snap_lock); + { + if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) { + changelog_snap_handle_ascii_change( + this, &(((changelog_local_t *)(frame->local))->cld)); + } + } + UNLOCK(&priv->c_snap_lock); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, 
offset, xdata); + return 0; +} - wind: - STACK_WIND (frame, changelog_truncate_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, - loc, offset, xdata); - return 0; +int32_t +changelog_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind); + + changelog_update(this, priv, local, CHANGELOG_TYPE_DATA); + +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } int32_t -changelog_ftruncate_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +changelog_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 0); + LOCK(&priv->c_snap_lock); + { + if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) { + changelog_snap_handle_ascii_change( + this, &(((changelog_local_t *)(frame->local))->cld)); + } + } + UNLOCK(&priv->c_snap_lock); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); +/* writev() */ - unwind: - CHANGELOG_STACK_UNWIND (ftruncate, frame, - op_ret, op_errno, 
prebuf, postbuf, xdata); - return 0; +int32_t +changelog_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO(priv, ((op_ret <= 0) || !local), unwind); + + changelog_update(this, priv, local, CHANGELOG_TYPE_DATA); + +unwind: + changelog_dec_fop_cnt(this, priv, local); + CHANGELOG_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } int32_t -changelog_ftruncate (call_frame_t *frame, - xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) +changelog_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - changelog_priv_t *priv = NULL; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 0); + LOCK(&priv->c_snap_lock); + { + if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) { + changelog_snap_handle_ascii_change( + this, &(((changelog_local_t *)(frame->local))->cld)); + } + } + UNLOCK(&priv->c_snap_lock); + +wind: + changelog_color_fop_and_inc_cnt(this, priv, frame->local); + STACK_WIND(frame, changelog_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} - wind: - STACK_WIND (frame, changelog_ftruncate_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, - fd, offset, xdata); - return 0; +/* }}} */ + +/* open, release and other beasts */ + +/* {{{ */ + +int +changelog_open_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + int ret = 0; + changelog_priv_t *priv = NULL; + changelog_event_t ev = { + 0, + }; + gf_boolean_t logopen = _gf_false; + + priv = this->private; + if (frame->local) { + frame->local = NULL; + logopen = _gf_true; + } + + CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !logopen), unwind); + + /* fill the event structure */ + ev.ev_type = CHANGELOG_OP_TYPE_OPEN; + gf_uuid_copy(ev.u.open.gfid, fd->inode->gfid); + ev.u.open.flags = fd->flags; + changelog_dispatch_event(this, priv, &ev); + + if (changelog_ev_selected(this, &priv->ev_selection, + CHANGELOG_OP_TYPE_RELEASE)) { + ret = fd_ctx_set(fd, this, (uint64_t)(long)0x1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT, + NULL); + } + +unwind: + CHANGELOG_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; } -/* writev() */ +int +changelog_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + frame->local = (void *)0x1; /* do not dereference in ->cbk */ + +wind: + STACK_WIND(frame, changelog_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +/* }}} */ + +/* {{{ */ + +/* }}} */ int32_t -changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, - dict_t *xdata) +_changelog_generic_dispatcher(dict_t *dict, char *key, data_t *value, + void *data) { - changelog_priv_t *priv = NULL; - changelog_local_t *local = NULL; + xlator_t *this = NULL; + changelog_priv_t *priv = NULL; + + this = data; + priv = this->private; - priv = this->private; - local = frame->local; + changelog_dispatch_event(this, priv, (changelog_event_t *)value->data); + return 0; +} - CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || 
!local), unwind); +/** + * changelog ipc dispatches events, pointers of which are passed in + * @xdata. Dispatching is orderless (whatever order dict_foreach() + * traverses the dictionary). + */ +int32_t +changelog_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + if (op != GF_IPC_TARGET_CHANGELOG) + goto wind; - changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + /* it's for us, do the job */ + if (xdata) + (void)dict_foreach(xdata, _changelog_generic_dispatcher, this); - unwind: - CHANGELOG_STACK_UNWIND (writev, frame, - op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + STACK_UNWIND_STRICT(ipc, frame, 0, 0, NULL); + return 0; + +wind: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; } +/* {{{ */ + int32_t -changelog_writev (call_frame_t *frame, - xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, uint32_t flags, - struct iobref *iobref, dict_t *xdata) +changelog_release(xlator_t *this, fd_t *fd) { - changelog_priv_t *priv = NULL; + changelog_event_t ev = { + 0, + }; + changelog_priv_t *priv = NULL; - priv = this->private; - CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + priv = this->private; - CHANGELOG_INIT (this, frame->local, - fd->inode, fd->inode->gfid, 0); + ev.ev_type = CHANGELOG_OP_TYPE_RELEASE; + gf_uuid_copy(ev.u.release.gfid, fd->inode->gfid); + changelog_dispatch_event(this, priv, &ev); - wind: - STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->writev, fd, vector, - count, offset, flags, iobref, xdata); - return 0; + (void)fd_ctx_del(fd, this, NULL); + + return 0; } /* }}} */ @@ -977,501 +1906,1084 @@ changelog_writev (call_frame_t *frame, * needed if there are more operation modes in the future. 
*/ static void -changelog_assign_opmode (changelog_priv_t *priv, char *mode) +changelog_assign_opmode(changelog_priv_t *priv, char *mode) { - if ( strncmp (mode, "realtime", 8) == 0 ) { - priv->op_mode = CHANGELOG_MODE_RT; - } + if (strncmp(mode, "realtime", 8) == 0) { + priv->op_mode = CHANGELOG_MODE_RT; + } } static void -changelog_assign_encoding (changelog_priv_t *priv, char *enc) +changelog_assign_encoding(changelog_priv_t *priv, char *enc) { - if ( strncmp (enc, "binary", 6) == 0 ) { - priv->encode_mode = CHANGELOG_ENCODE_BINARY; - } else if ( strncmp (enc, "ascii", 5) == 0 ) { - priv->encode_mode = CHANGELOG_ENCODE_ASCII; - } + if (strncmp(enc, "binary", 6) == 0) { + priv->encode_mode = CHANGELOG_ENCODE_BINARY; + } else if (strncmp(enc, "ascii", 5) == 0) { + priv->encode_mode = CHANGELOG_ENCODE_ASCII; + } } -/* cleanup any helper threads that are running */ static void -changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv) +changelog_assign_barrier_timeout(changelog_priv_t *priv, uint32_t timeout) { - if (priv->cr.rollover_th) { - changelog_thread_cleanup (this, priv->cr.rollover_th); - priv->cr.rollover_th = 0; - } + LOCK(&priv->lock); + { + priv->timeout.tv_sec = timeout; + } + UNLOCK(&priv->lock); +} - if (priv->cf.fsync_th) { - changelog_thread_cleanup (this, priv->cf.fsync_th); - priv->cf.fsync_th = 0; - } +/* cleanup any helper threads that are running */ +static void +changelog_cleanup_helper_threads(xlator_t *this, changelog_priv_t *priv) +{ + if (priv->cr.rollover_th) { + (void)changelog_thread_cleanup(this, priv->cr.rollover_th); + priv->cr.rollover_th = 0; + } + + if (priv->cf.fsync_th) { + (void)changelog_thread_cleanup(this, priv->cf.fsync_th); + priv->cf.fsync_th = 0; + } } /* spawn helper thread; cleaning up in case of errors */ static int -changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv) +changelog_spawn_helper_threads(xlator_t *this, changelog_priv_t *priv) { - int ret = 0; - - priv->cr.this = this; 
- ret = gf_thread_create (&priv->cr.rollover_th, - NULL, changelog_rollover, priv); - if (ret) - goto out; + int ret = 0; + + /* Geo-Rep snapshot dependency: + * + * To implement explicit rollover of changlog journal on barrier + * notification, a pipe is created to communicate between + * 'changelog_rollover' thread and changelog main thread. The select + * call used to wait till roll-over time in changelog_rollover thread + * is modified to wait on read end of the pipe. When barrier + * notification comes (i.e, in 'reconfigure'), select in + * changelog_rollover thread is woken up explicitly by writing into + * the write end of the pipe in 'reconfigure'. + */ + + priv->cr.notify = _gf_false; + priv->cr.this = this; + ret = gf_thread_create(&priv->cr.rollover_th, NULL, changelog_rollover, + priv, "clogro"); + if (ret) + goto out; + + if (priv->fsync_interval) { + priv->cf.this = this; + ret = gf_thread_create(&priv->cf.fsync_th, NULL, changelog_fsync_thread, + priv, "clogfsyn"); + } + + if (ret) + changelog_cleanup_helper_threads(this, priv); + +out: + return ret; +} - if (priv->fsync_interval) { - priv->cf.this = this; - ret = gf_thread_create (&priv->cf.fsync_th, - NULL, changelog_fsync_thread, priv); +int +notify(xlator_t *this, int event, void *data, ...) 
+{ + changelog_priv_t *priv = NULL; + dict_t *dict = NULL; + char buf[1] = {1}; + int barrier = DICT_DEFAULT; + gf_boolean_t bclean_req = _gf_false; + int ret = 0; + int ret1 = 0; + struct list_head queue = { + 0, + }; + uint64_t xprtcnt = 0; + uint64_t clntcnt = 0; + changelog_clnt_t *conn = NULL; + gf_boolean_t cleanup_notify = _gf_false; + char sockfile[UNIX_PATH_MAX] = { + 0, + }; + rpcsvc_listener_t *listener = NULL; + rpcsvc_listener_t *next = NULL; + + INIT_LIST_HEAD(&queue); + + priv = this->private; + if (!priv) + goto out; + + if (event == GF_EVENT_PARENT_DOWN) { + priv->victim = data; + gf_log(this->name, GF_LOG_INFO, + "cleanup changelog rpc connection of brick %s", + priv->victim->name); + + if (priv->rpc_active) { + this->cleanup_starting = 1; + changelog_destroy_rpc_listner(this, priv); + conn = &priv->connections; + if (conn) + changelog_ev_cleanup_connections(this, conn); + xprtcnt = GF_ATOMIC_GET(priv->xprtcnt); + clntcnt = GF_ATOMIC_GET(priv->clntcnt); + if (!xprtcnt && !clntcnt) { + LOCK(&priv->lock); + { + cleanup_notify = priv->notify_down; + priv->notify_down = _gf_true; + } + UNLOCK(&priv->lock); + if (priv->rpc) { + list_for_each_entry_safe(listener, next, + &priv->rpc->listeners, list) + { + if (listener->trans) { + rpc_transport_unref(listener->trans); + } + } + rpcsvc_destroy(priv->rpc); + priv->rpc = NULL; + } + CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, + UNIX_PATH_MAX); + sys_unlink(sockfile); + if (!cleanup_notify) + default_notify(this, GF_EVENT_PARENT_DOWN, data); + } + } else { + default_notify(this, GF_EVENT_PARENT_DOWN, data); } + goto out; + } - if (ret) - changelog_cleanup_helper_threads (this, priv); + if (event == GF_EVENT_TRANSLATOR_OP) { + dict = data; - out: - return ret; -} + barrier = dict_get_str_boolean(dict, "barrier", DICT_DEFAULT); -/* cleanup the notifier thread */ -static int -changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv) -{ - int ret = 0; + switch (barrier) { + case 
DICT_ERROR: + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_DICT_GET_FAILED, "dict_get_str_boolean", + NULL); + ret = -1; + goto out; - if (priv->cn.notify_th) { - changelog_thread_cleanup (this, priv->cn.notify_th); - priv->cn.notify_th = 0; + case BARRIER_OFF: + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "off", NULL); - ret = close (priv->wfd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error closing writer end of notifier pipe" - " (reason: %s)", strerror (errno)); - } + CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out); + LOCK(&priv->c_snap_lock); + { + changelog_snap_logging_stop(this, priv); + } + UNLOCK(&priv->c_snap_lock); - return ret; -} + LOCK(&priv->bflags.lock); + { + if (priv->bflags.barrier_ext == _gf_false) + ret = -1; + } + UNLOCK(&priv->bflags.lock); -/* spawn the notifier thread - nop if already running */ -static int -changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv) -{ - int ret = 0; - int flags = 0; - int pipe_fd[2] = {0, 0}; + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_BARRIER_ERROR, NULL); + goto out; + } + + /* Stop changelog barrier and dequeue all fops */ + LOCK(&priv->lock); + { + if (priv->barrier_enabled == _gf_true) + __chlog_barrier_disable(this, &queue); + else + ret = -1; + } + UNLOCK(&priv->lock); + /* If ret = -1, then changelog barrier is already + * disabled because of error or timeout. 
+ */ + if (ret == 0) { + chlog_barrier_dequeue_all(this, &queue); + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_BARRIER_DISABLED, NULL); + } else { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_BARRIER_ALREADY_DISABLED, NULL); + } - if (priv->cn.notify_th) - goto out; /* notifier thread already running */ + LOCK(&priv->bflags.lock); + { + priv->bflags.barrier_ext = _gf_false; + } + UNLOCK(&priv->bflags.lock); - ret = pipe (pipe_fd); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot create pipe (reason: %s)", strerror (errno)); goto out; - } - /* writer is non-blocking */ - flags = fcntl (pipe_fd[1], F_GETFL); - flags |= O_NONBLOCK; + case BARRIER_ON: + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "on", NULL); + + CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out); + LOCK(&priv->c_snap_lock); + { + changelog_snap_logging_start(this, priv); + } + UNLOCK(&priv->c_snap_lock); + + LOCK(&priv->bflags.lock); + { + if (priv->bflags.barrier_ext == _gf_true) + ret = -1; + else + priv->bflags.barrier_ext = _gf_true; + } + UNLOCK(&priv->bflags.lock); + + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_BARRIER_ON_ERROR, NULL); + goto out; + } + + ret = pthread_mutex_lock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req); + { + priv->bn.bnotify = _gf_true; + } + ret = pthread_mutex_unlock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req); + + /* Start changelog barrier */ + LOCK(&priv->lock); + { + ret = __chlog_barrier_enable(this, priv); + } + UNLOCK(&priv->lock); + if (ret == -1) { + changelog_barrier_cleanup(this, priv, &queue); + goto out; + } + + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_BARRIER_ENABLE, NULL); + + ret = changelog_barrier_notify(priv, buf); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_WRITE_FAILED, "Explicit roll over", + NULL); + changelog_barrier_cleanup(this, priv, &queue); + ret = 
-1; + goto out; + } + + ret = pthread_mutex_lock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req); + { + /* The while condition check is required here to + * handle spurious wakeup of cond wait that can + * happen with pthreads. See man page */ + while (priv->bn.bnotify == _gf_true) { + ret = pthread_cond_wait(&priv->bn.bnotify_cond, + &priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req); + } + if (priv->bn.bnotify_error == _gf_true) { + ret = -1; + priv->bn.bnotify_error = _gf_false; + } + } + ret1 = pthread_mutex_unlock(&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret1, out, bclean_req); + gf_smsg(this->name, GF_LOG_INFO, 0, + CHANGELOG_MSG_BNOTIFY_COND_INFO, NULL); - ret = fcntl (pipe_fd[1], F_SETFL, flags); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set O_NONBLOCK flag"); goto out; - } - priv->wfd = pipe_fd[1]; + case DICT_DEFAULT: + gf_smsg(this->name, GF_LOG_ERROR, 0, + CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, NULL); + ret = -1; + goto out; - priv->cn.this = this; - priv->cn.rfd = pipe_fd[0]; + default: + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, + CHANGELOG_MSG_ERROR_IN_DICT_GET, NULL); + ret = -1; + goto out; + } + } else { + ret = default_notify(this, event, data); + } - ret = gf_thread_create (&priv->cn.notify_th, - NULL, changelog_notifier, priv); +out: + if (bclean_req) + changelog_barrier_cleanup(this, priv, &queue); - out: - return ret; + return ret; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) - return ret; + if (!this) + return ret; - ret = xlator_mem_acct_init (this, gf_changelog_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, "Memory accounting" - " init failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_changelog_mt_end + 1); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + CHANGELOG_MSG_MEMORY_INIT_FAILED, NULL); return ret; 
+ } + + return ret; } static int -changelog_init (xlator_t *this, changelog_priv_t *priv) +changelog_init(xlator_t *this, changelog_priv_t *priv) { - int i = 0; - int ret = -1; - struct timeval tv = {0,}; - changelog_log_data_t cld = {0,}; + int i = 0; + int ret = 0; + changelog_log_data_t cld = { + 0, + }; + + priv->maps[CHANGELOG_TYPE_DATA] = "D "; + priv->maps[CHANGELOG_TYPE_METADATA] = "M "; + priv->maps[CHANGELOG_TYPE_METADATA_XATTR] = "M "; + priv->maps[CHANGELOG_TYPE_ENTRY] = "E "; + + for (; i < CHANGELOG_MAX_TYPE; i++) { + /* start with version 1 */ + priv->slice.changelog_version[i] = 1; + } + + if (!priv->active) + return ret; - ret = gettimeofday (&tv, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "gettimeofday() failure"); - goto out; - } + /** + * start with a fresh changelog file every time. this is done + * in case there was an encoding change. so... things are kept + * simple here. + */ + changelog_fill_rollover_data(&cld, _gf_false); + + ret = htime_open(this, priv, cld.cld_roll_time); + /* call htime open with cld's rollover_time */ + if (ret) + goto out; + + LOCK(&priv->lock); + { + ret = changelog_inject_single_event(this, priv, &cld); + } + UNLOCK(&priv->lock); + + /* ... 
and finally spawn the helpers threads */ + ret = changelog_spawn_helper_threads(this, priv); + +out: + return ret; +} + +/** + * Init barrier related condition variables and locks + */ +static int +changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv) +{ + gf_boolean_t bn_mutex_init = _gf_false; + gf_boolean_t bn_cond_init = _gf_false; + gf_boolean_t dm_mutex_black_init = _gf_false; + gf_boolean_t dm_cond_black_init = _gf_false; + gf_boolean_t dm_mutex_white_init = _gf_false; + gf_boolean_t dm_cond_white_init = _gf_false; + gf_boolean_t cr_mutex_init = _gf_false; + gf_boolean_t cr_cond_init = _gf_false; + int ret = 0; + + if ((ret = pthread_mutex_init(&priv->bn.bnotify_mutex, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=bnotify", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + bn_mutex_init = _gf_true; + + if ((ret = pthread_cond_init(&priv->bn.bnotify_cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=bnotify", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + bn_cond_init = _gf_true; + + if ((ret = pthread_mutex_init(&priv->dm.drain_black_mutex, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_black", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + dm_mutex_black_init = _gf_true; + + if ((ret = pthread_cond_init(&priv->dm.drain_black_cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_black", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + dm_cond_black_init = _gf_true; + + if ((ret = pthread_mutex_init(&priv->dm.drain_white_mutex, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_white", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + dm_mutex_white_init = _gf_true; + + if ((ret = 
pthread_cond_init(&priv->dm.drain_white_cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_white", + "ret=%d", ret, NULL); + ret = -1; + goto out; + } + dm_cond_white_init = _gf_true; + + if ((pthread_mutex_init(&priv->cr.lock, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, + "name=changelog_rollover", "ret=%d", ret, NULL); + ret = -1; + goto out; + } + cr_mutex_init = _gf_true; + + if ((pthread_cond_init(&priv->cr.cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, + CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, + "changelog_rollover cond init failed", "ret=%d", ret, NULL); + ret = -1; + goto out; + } + cr_cond_init = _gf_true; +out: + if (ret) { + if (bn_mutex_init) + pthread_mutex_destroy(&priv->bn.bnotify_mutex); + if (bn_cond_init) + pthread_cond_destroy(&priv->bn.bnotify_cond); + if (dm_mutex_black_init) + pthread_mutex_destroy(&priv->dm.drain_black_mutex); + if (dm_cond_black_init) + pthread_cond_destroy(&priv->dm.drain_black_cond); + if (dm_mutex_white_init) + pthread_mutex_destroy(&priv->dm.drain_white_mutex); + if (dm_cond_white_init) + pthread_cond_destroy(&priv->dm.drain_white_cond); + if (cr_mutex_init) + pthread_mutex_destroy(&priv->cr.lock); + if (cr_cond_init) + pthread_cond_destroy(&priv->cr.cond); + } + return ret; +} - priv->slice.tv_start = tv; +/* Destroy barrier related condition variables and locks */ +static void +changelog_barrier_pthread_destroy(changelog_priv_t *priv) +{ + pthread_mutex_destroy(&priv->bn.bnotify_mutex); + pthread_cond_destroy(&priv->bn.bnotify_cond); + pthread_mutex_destroy(&priv->dm.drain_black_mutex); + pthread_cond_destroy(&priv->dm.drain_black_cond); + pthread_mutex_destroy(&priv->dm.drain_white_mutex); + pthread_cond_destroy(&priv->dm.drain_white_cond); + pthread_mutex_destroy(&priv->cr.lock); + pthread_cond_destroy(&priv->cr.cond); + LOCK_DESTROY(&priv->bflags.lock); +} - 
priv->maps[CHANGELOG_TYPE_DATA] = "D "; - priv->maps[CHANGELOG_TYPE_METADATA] = "M "; - priv->maps[CHANGELOG_TYPE_ENTRY] = "E "; +static void +changelog_cleanup_rpc(xlator_t *this, changelog_priv_t *priv) +{ + /* terminate rpc server */ + if (!this->cleanup_starting) + changelog_destroy_rpc_listner(this, priv); - for (; i < CHANGELOG_MAX_TYPE; i++) { - /* start with version 1 */ - priv->slice.changelog_version[i] = 1; - } + (void)changelog_cleanup_rpc_threads(this, priv); + /* cleanup rot buffs */ + rbuf_dtor(priv->rbuf); - if (!priv->active) - return ret; + /* cleanup poller thread */ + if (priv->poller) + (void)changelog_thread_cleanup(this, priv->poller); +} - /* spawn the notifier thread */ - ret = changelog_spawn_notifier (this, priv); +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = 0; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + gf_boolean_t active_earlier = _gf_true; + gf_boolean_t active_now = _gf_true; + gf_boolean_t rpc_active_earlier = _gf_true; + gf_boolean_t rpc_active_now = _gf_true; + gf_boolean_t iniate_rpc = _gf_false; + changelog_time_slice_t *slice = NULL; + changelog_log_data_t cld = { + 0, + }; + char htime_dir[PATH_MAX] = { + 0, + }; + char csnap_dir[PATH_MAX] = { + 0, + }; + uint32_t timeout = 0; + + priv = this->private; + if (!priv) + goto out; + + ret = -1; + active_earlier = priv->active; + rpc_active_earlier = priv->rpc_active; + + /* first stop the rollover and the fsync thread */ + changelog_cleanup_helper_threads(this, priv); + + GF_OPTION_RECONF("changelog-dir", tmp, options, str, out); + if (!tmp) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_DIR_OPTIONS_NOT_SET, + NULL); + goto out; + } + + GF_FREE(priv->changelog_dir); + priv->changelog_dir = gf_strdup(tmp); + if (!priv->changelog_dir) + goto out; + + ret = mkdir_p(priv->changelog_dir, 0600, _gf_true); + + if (ret) + goto out; + CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, htime_dir); + ret = mkdir_p(htime_dir, 0600, _gf_true); + + if (ret) + 
goto out; + + CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir); + ret = mkdir_p(csnap_dir, 0600, _gf_true); + + if (ret) + goto out; + + GF_OPTION_RECONF("changelog", active_now, options, bool, out); + GF_OPTION_RECONF("changelog-notification", rpc_active_now, options, bool, + out); + + /* If journalling is enabled, enable rpc notifications */ + if (active_now && !active_earlier) { + if (!rpc_active_earlier) + iniate_rpc = _gf_true; + } + + if (rpc_active_now && !rpc_active_earlier) { + iniate_rpc = _gf_true; + } + + /* TODO: Disable of changelog-notifications is not supported for now + * as there is no clean way of cleaning up of rpc resources + */ + + if (iniate_rpc) { + ret = changelog_init_rpc(this, priv); if (ret) - goto out; + goto out; + priv->rpc_active = _gf_true; + } - /** - * start with a fresh changelog file every time. this is done - * in case there was an encoding change. so... things are kept - * simple here. - */ - ret = changelog_fill_rollover_data (&cld, _gf_false); - if (ret) - goto out; + /** + * changelog_handle_change() handles changes that could possibly + * have been submit changes before changelog deactivation. + */ + if (!active_now) + priv->active = _gf_false; - LOCK (&priv->lock); - { - ret = changelog_inject_single_event (this, priv, &cld); - } - UNLOCK (&priv->lock); + GF_OPTION_RECONF("op-mode", tmp, options, str, out); + changelog_assign_opmode(priv, tmp); - /* ... 
and finally spawn the helpers threads */ - ret = changelog_spawn_helper_threads (this, priv); + tmp = NULL; - out: - return ret; -} + GF_OPTION_RECONF("encoding", tmp, options, str, out); + changelog_assign_encoding(priv, tmp); -int -reconfigure (xlator_t *this, dict_t *options) -{ - int ret = 0; - char *tmp = NULL; - changelog_priv_t *priv = NULL; - gf_boolean_t active_earlier = _gf_true; - gf_boolean_t active_now = _gf_true; - changelog_time_slice_t *slice = NULL; - changelog_log_data_t cld = {0,}; - - priv = this->private; - if (!priv) - goto out; + GF_OPTION_RECONF("rollover-time", priv->rollover_time, options, int32, out); + GF_OPTION_RECONF("fsync-interval", priv->fsync_interval, options, int32, + out); + GF_OPTION_RECONF("changelog-barrier-timeout", timeout, options, time, out); + changelog_assign_barrier_timeout(priv, timeout); - ret = -1; - active_earlier = priv->active; + GF_OPTION_RECONF("capture-del-path", priv->capture_del_path, options, bool, + out); - /* first stop the rollover and the fsync thread */ - changelog_cleanup_helper_threads (this, priv); + if (active_now || active_earlier) { + changelog_fill_rollover_data(&cld, !active_now); - GF_OPTION_RECONF ("changelog-dir", tmp, options, str, out); - if (!tmp) { - gf_log (this->name, GF_LOG_ERROR, - "\"changelog-dir\" option is not set"); - goto out; - } + slice = &priv->slice; - GF_FREE (priv->changelog_dir); - priv->changelog_dir = gf_strdup (tmp); - if (!priv->changelog_dir) - goto out; + LOCK(&priv->lock); + { + ret = changelog_inject_single_event(this, priv, &cld); + if (!ret && active_now) + SLICE_VERSION_UPDATE(slice); + } + UNLOCK(&priv->lock); - ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); if (ret) - goto out; + goto out; + + if (active_now) { + if (!active_earlier) { + gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_RECONFIGURE, + NULL); + htime_create(this, priv, gf_time()); + } + ret = changelog_spawn_helper_threads(this, priv); + } + } + +out: + if (ret) { + /* TODO */ + } else 
{ + gf_msg_debug(this->name, 0, "changelog reconfigured"); + if (active_now && priv) + priv->active = _gf_true; + } + + return ret; +} - GF_OPTION_RECONF ("changelog", active_now, options, bool, out); +static void +changelog_freeup_options(xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; - /** - * changelog_handle_change() handles changes that could possibly - * have been submit changes before changelog deactivation. - */ - if (!active_now) - priv->active = _gf_false; + ret = priv->cb->dtor(this, &priv->cd); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_FREEUP_FAILED, NULL); + GF_FREE(priv->changelog_brick); + GF_FREE(priv->changelog_dir); +} - GF_OPTION_RECONF ("op-mode", tmp, options, str, out); - changelog_assign_opmode (priv, tmp); +static int +changelog_init_options(xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + char *tmp = NULL; + uint32_t timeout = 0; + char htime_dir[PATH_MAX] = { + 0, + }; + char csnap_dir[PATH_MAX] = { + 0, + }; - tmp = NULL; + GF_OPTION_INIT("changelog-brick", tmp, str, error_return); + priv->changelog_brick = gf_strdup(tmp); + if (!priv->changelog_brick) + goto error_return; - GF_OPTION_RECONF ("encoding", tmp, options, str, out); - changelog_assign_encoding (priv, tmp); + tmp = NULL; - GF_OPTION_RECONF ("rollover-time", - priv->rollover_time, options, int32, out); - GF_OPTION_RECONF ("fsync-interval", - priv->fsync_interval, options, int32, out); + GF_OPTION_INIT("changelog-dir", tmp, str, dealloc_1); + priv->changelog_dir = gf_strdup(tmp); + if (!priv->changelog_dir) + goto dealloc_1; - if (active_now || active_earlier) { - ret = changelog_fill_rollover_data (&cld, !active_now); - if (ret) - goto out; + tmp = NULL; - slice = &priv->slice; + /** + * create the directory even if change-logging would be inactive + * so that consumers can _look_ into it (finding nothing...) 
+ */ + ret = mkdir_p(priv->changelog_dir, 0600, _gf_true); - LOCK (&priv->lock); - { - ret = changelog_inject_single_event (this, priv, &cld); - if (!ret && active_now) - SLICE_VERSION_UPDATE (slice); - } - UNLOCK (&priv->lock); - - if (ret) - goto out; - - if (active_now) { - ret = changelog_spawn_notifier (this, priv); - if (!ret) - ret = changelog_spawn_helper_threads (this, - priv); - } else - ret = changelog_cleanup_notifier (this, priv); - } + if (ret) + goto dealloc_2; - out: - if (ret) { - ret = changelog_cleanup_notifier (this, priv); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "changelog reconfigured"); - if (active_now) - priv->active = _gf_true; - } + CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, htime_dir); + ret = mkdir_p(htime_dir, 0600, _gf_true); + if (ret) + goto dealloc_2; - return ret; -} + CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir); + ret = mkdir_p(csnap_dir, 0600, _gf_true); + if (ret) + goto dealloc_2; -int32_t -init (xlator_t *this) -{ - int ret = -1; - char *tmp = NULL; - changelog_priv_t *priv = NULL; + GF_OPTION_INIT("changelog", priv->active, bool, dealloc_2); + GF_OPTION_INIT("changelog-notification", priv->rpc_active, bool, dealloc_2); + GF_OPTION_INIT("capture-del-path", priv->capture_del_path, bool, dealloc_2); - GF_VALIDATE_OR_GOTO ("changelog", this, out); + GF_OPTION_INIT("op-mode", tmp, str, dealloc_2); + changelog_assign_opmode(priv, tmp); - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator needs a single subvolume"); - goto out; - } + tmp = NULL; - if (!this->parents) { - gf_log (this->name, GF_LOG_ERROR, - "dangling volume. 
please check volfile"); - goto out; - } + GF_OPTION_INIT("encoding", tmp, str, dealloc_2); + changelog_assign_encoding(priv, tmp); + changelog_encode_change(priv); - priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t); - if (!priv) - goto out; + GF_OPTION_INIT("rollover-time", priv->rollover_time, int32, dealloc_2); - this->local_pool = mem_pool_new (changelog_local_t, 64); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local memory pool"); - goto out; - } + GF_OPTION_INIT("fsync-interval", priv->fsync_interval, int32, dealloc_2); - LOCK_INIT (&priv->lock); + GF_OPTION_INIT("changelog-barrier-timeout", timeout, time, dealloc_2); + changelog_assign_barrier_timeout(priv, timeout); - GF_OPTION_INIT ("changelog-brick", tmp, str, out); - if (!tmp) { - gf_log (this->name, GF_LOG_ERROR, - "\"changelog-brick\" option is not set"); - goto out; - } + GF_ASSERT(cb_bootstrap[priv->op_mode].mode == priv->op_mode); + priv->cb = &cb_bootstrap[priv->op_mode]; - priv->changelog_brick = gf_strdup (tmp); - if (!priv->changelog_brick) - goto out; - tmp = NULL; + /* ... now bootstrap the logger */ + ret = priv->cb->ctor(this, &priv->cd); + if (ret) + goto dealloc_2; - GF_OPTION_INIT ("changelog-dir", tmp, str, out); - if (!tmp) { - gf_log (this->name, GF_LOG_ERROR, - "\"changelog-dir\" option is not set"); - goto out; - } + priv->changelog_fd = -1; - priv->changelog_dir = gf_strdup (tmp); - if (!priv->changelog_dir) - goto out; - tmp = NULL; + return 0; - /** - * create the directory even if change-logging would be inactive - * so that consumers can _look_ into it (finding nothing...) 
- */ - ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); - if (ret) - goto out; +dealloc_2: + GF_FREE(priv->changelog_dir); +dealloc_1: + GF_FREE(priv->changelog_brick); +error_return: + return -1; +} - GF_OPTION_INIT ("changelog", priv->active, bool, out); +static int +changelog_init_rpc(xlator_t *this, changelog_priv_t *priv) +{ + rpcsvc_t *rpc = NULL; + changelog_ev_selector_t *selection = NULL; - GF_OPTION_INIT ("op-mode", tmp, str, out); - changelog_assign_opmode (priv, tmp); + selection = &priv->ev_selection; - tmp = NULL; + /* initialize event selection */ + changelog_init_event_selection(this, selection); - GF_OPTION_INIT ("encoding", tmp, str, out); - changelog_assign_encoding (priv, tmp); + priv->rbuf = rbuf_init(NR_ROTT_BUFFS); + if (!priv->rbuf) + goto cleanup_thread; - GF_OPTION_INIT ("rollover-time", priv->rollover_time, int32, out); + rpc = changelog_init_rpc_listener(this, priv, priv->rbuf, NR_DISPATCHERS); + if (!rpc) + goto cleanup_rbuf; + priv->rpc = rpc; - GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out); + return 0; - changelog_encode_change(priv); +cleanup_rbuf: + rbuf_dtor(priv->rbuf); +cleanup_thread: + if (priv->poller) + (void)changelog_thread_cleanup(this, priv->poller); - GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode); - priv->cb = &cb_bootstrap[priv->op_mode]; + return -1; +} - /* ... 
now bootstrap the logger */ - ret = priv->cb->ctor (this, &priv->cd); +int32_t +init(xlator_t *this) +{ + int ret = -1; + changelog_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("changelog", this, error_return); + + if (!this->children || this->children->next) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CHILD_MISCONFIGURED, + NULL); + goto error_return; + } + + if (!this->parents) { + gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_VOL_MISCONFIGURED, + NULL); + goto error_return; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_changelog_mt_priv_t); + if (!priv) + goto error_return; + + this->local_pool = mem_pool_new(changelog_local_t, 64); + if (!this->local_pool) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY, + NULL); + goto cleanup_priv; + } + + LOCK_INIT(&priv->lock); + LOCK_INIT(&priv->c_snap_lock); + GF_ATOMIC_INIT(priv->listnercnt, 0); + GF_ATOMIC_INIT(priv->clntcnt, 0); + GF_ATOMIC_INIT(priv->xprtcnt, 0); + INIT_LIST_HEAD(&priv->xprt_list); + priv->htime_fd = -1; + + ret = changelog_init_options(this, priv); + if (ret) + goto cleanup_mempool; + + /* snap dependency changes */ + priv->dm.black_fop_cnt = 0; + priv->dm.white_fop_cnt = 0; + priv->dm.drain_wait_black = _gf_false; + priv->dm.drain_wait_white = _gf_false; + priv->current_color = FOP_COLOR_BLACK; + priv->explicit_rollover = _gf_false; + + priv->cr.notify = _gf_false; + /* Mutex is not needed as threads are not spawned yet */ + priv->bn.bnotify = _gf_false; + priv->bn.bnotify_error = _gf_false; + ret = changelog_barrier_pthread_init(this, priv); + if (ret) + goto cleanup_options; + LOCK_INIT(&priv->bflags.lock); + priv->bflags.barrier_ext = _gf_false; + + /* Changelog barrier init */ + INIT_LIST_HEAD(&priv->queue); + priv->barrier_enabled = _gf_false; + + if (priv->rpc_active || priv->active) { + /* RPC ball rolling.. 
*/ + ret = changelog_init_rpc(this, priv); if (ret) - goto out; + goto cleanup_barrier; + priv->rpc_active = _gf_true; + } + + ret = changelog_init(this, priv); + if (ret) + goto cleanup_rpc; + + gf_msg_debug(this->name, 0, "changelog translator loaded"); + + this->private = priv; + return 0; + +cleanup_rpc: + if (priv->rpc_active) { + changelog_cleanup_rpc(this, priv); + } +cleanup_barrier: + changelog_barrier_pthread_destroy(priv); +cleanup_options: + changelog_freeup_options(this, priv); +cleanup_mempool: + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; +cleanup_priv: + GF_FREE(priv); +error_return: + this->private = NULL; + return -1; +} - priv->changelog_fd = -1; - ret = changelog_init (this, priv); - if (ret) - goto out; +void +fini(xlator_t *this) +{ + changelog_priv_t *priv = NULL; + struct list_head queue = { + 0, + }; + + priv = this->private; + + if (priv) { + if (priv->active || priv->rpc_active) { + /* terminate RPC server/threads */ + changelog_cleanup_rpc(this, priv); + GF_FREE(priv->ev_dispatcher); + } + /* call barrier_disable to cancel timer */ + if (priv->barrier_enabled) + __chlog_barrier_disable(this, &queue); - gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded"); + /* cleanup barrier related objects */ + changelog_barrier_pthread_destroy(priv); - out: - if (ret) { - if (this->local_pool) - mem_pool_destroy (this->local_pool); - if (priv->cb) { - ret = priv->cb->dtor (this, &priv->cd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error in cleanup during init()"); - } - GF_FREE (priv->changelog_brick); - GF_FREE (priv->changelog_dir); - GF_FREE (priv); - this->private = NULL; - } else - this->private = priv; + /* cleanup helper threads */ + changelog_cleanup_helper_threads(this, priv); - return ret; -} + /* cleanup allocated options */ + changelog_freeup_options(this, priv); -void -fini (xlator_t *this) -{ - int ret = -1; - changelog_priv_t *priv = NULL; - - priv = this->private; - - if (priv) { - ret = 
priv->cb->dtor (this, &priv->cd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error in fini"); - mem_pool_destroy (this->local_pool); - GF_FREE (priv->changelog_brick); - GF_FREE (priv->changelog_dir); - GF_FREE (priv); + /* deallocate mempool */ + mem_pool_destroy(this->local_pool); + + if (priv->htime_fd != -1) { + sys_close(priv->htime_fd); } - this->private = NULL; + /* finally, dealloac private variable */ + GF_FREE(priv); + } - return; + this->private = NULL; + this->local_pool = NULL; + + return; } struct xlator_fops fops = { - .mknod = changelog_mknod, - .mkdir = changelog_mkdir, - .create = changelog_create, - .symlink = changelog_symlink, - .writev = changelog_writev, - .truncate = changelog_truncate, - .ftruncate = changelog_ftruncate, - .link = changelog_link, - .rename = changelog_rename, - .unlink = changelog_unlink, - .rmdir = changelog_rmdir, - .setattr = changelog_setattr, - .fsetattr = changelog_fsetattr, - .setxattr = changelog_setxattr, - .fsetxattr = changelog_fsetxattr, - .removexattr = changelog_removexattr, - .fremovexattr = changelog_fremovexattr, + .open = changelog_open, + .mknod = changelog_mknod, + .mkdir = changelog_mkdir, + .create = changelog_create, + .symlink = changelog_symlink, + .writev = changelog_writev, + .truncate = changelog_truncate, + .ftruncate = changelog_ftruncate, + .link = changelog_link, + .rename = changelog_rename, + .unlink = changelog_unlink, + .rmdir = changelog_rmdir, + .setattr = changelog_setattr, + .fsetattr = changelog_fsetattr, + .setxattr = changelog_setxattr, + .fsetxattr = changelog_fsetxattr, + .removexattr = changelog_removexattr, + .fremovexattr = changelog_fremovexattr, + .ipc = changelog_ipc, + .xattrop = changelog_xattrop, + .fxattrop = changelog_fxattrop, }; struct xlator_cbks cbks = { - .forget = changelog_forget, + .forget = changelog_forget, + .release = changelog_release, }; struct volume_options options[] = { - {.key = {"changelog"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = 
"off", - .description = "enable/disable change-logging" - }, - {.key = {"changelog-brick"}, - .type = GF_OPTION_TYPE_PATH, - .description = "brick path to generate unique socket file name." - " should be the export directory of the volume strictly." - }, - {.key = {"changelog-dir"}, - .type = GF_OPTION_TYPE_PATH, - .description = "directory for the changelog files" - }, - {.key = {"op-mode"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "realtime", - .value = {"realtime"}, - .description = "operation mode - futuristic operation modes" - }, - {.key = {"encoding"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "ascii", - .value = {"binary", "ascii"}, - .description = "encoding type for changelogs" - }, - {.key = {"rollover-time"}, - .default_value = "60", - .type = GF_OPTION_TYPE_TIME, - .description = "time to switch to a new changelog file (in seconds)" - }, - {.key = {"fsync-interval"}, - .type = GF_OPTION_TYPE_TIME, - .default_value = "0", - .description = "do not open CHANGELOG file with O_SYNC mode." - " instead perform fsync() at specified intervals" - }, - {.key = {NULL} - }, + {.key = {"changelog"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable change-logging", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_BASIC, + .tags = {"journal", "georep", "glusterfind"}}, + {.key = {"changelog-notification"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable changelog live notification", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .tags = {"bitrot", "georep"}}, + {.key = {"changelog-brick"}, + .type = GF_OPTION_TYPE_PATH, + .description = "brick path to generate unique socket file name." 
+ " should be the export directory of the volume strictly.", + .default_value = "{{ brick.path }}", + .op_version = {3}, + .tags = {"journal"}}, + {.key = {"changelog-dir"}, + .type = GF_OPTION_TYPE_PATH, + .description = "directory for the changelog files", + .default_value = "{{ brick.path }}/.glusterfs/changelogs", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_ADVANCED, + .tags = {"journal", "georep", "glusterfind"}}, + {.key = {"op-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "realtime", + .value = {"realtime"}, + .description = "operation mode - futuristic operation modes", + .op_version = {3}, + .tags = {"journal"}}, + {.key = {"encoding"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "ascii", + .value = {"binary", "ascii"}, + .description = "encoding type for changelogs", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_ADVANCED, + .tags = {"journal"}}, + {.key = {"rollover-time"}, + .default_value = "15", + .type = GF_OPTION_TYPE_TIME, + .description = "time to switch to a new changelog file (in seconds)", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_ADVANCED, + .tags = {"journal", "georep", "glusterfind"}}, + {.key = {"fsync-interval"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "5", + .description = "do not open CHANGELOG file with O_SYNC mode." 
+ " instead perform fsync() at specified intervals", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_ADVANCED, + .tags = {"journal"}}, + {.key = {"changelog-barrier-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = BARRIER_TIMEOUT, + .description = "After 'timeout' seconds since the time 'barrier' " + "option was set to \"on\", unlink/rmdir/rename " + "operations are no longer blocked and previously " + "blocked fops are allowed to go through", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_ADVANCED, + .tags = {"journal"}}, + {.key = {"capture-del-path"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable capturing paths of deleted entries", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE, + .level = OPT_STATUS_BASIC, + .tags = {"journal", "glusterfind"}}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "changelog", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/path-convertor/Makefile.am b/xlators/features/cloudsync/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/features/path-convertor/Makefile.am +++ b/xlators/features/cloudsync/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/features/cloudsync/src/Makefile.am b/xlators/features/cloudsync/src/Makefile.am new file mode 100644 index 00000000000..e2a277e372b --- /dev/null +++ b/xlators/features/cloudsync/src/Makefile.am @@ -0,0 +1,46 @@ +SUBDIRS = cloudsync-plugins + +xlator_LTLIBRARIES = cloudsync.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +cloudsync_sources = cloudsync.c + +CLOUDSYNC_SRC = $(top_srcdir)/xlators/features/cloudsync/src +CLOUDSYNC_BLD = 
$(top_builddir)/xlators/features/cloudsync/src + +cloudsynccommon_sources = $(CLOUDSYNC_SRC)/cloudsync-common.c + +noinst_HEADERS = $(CLOUDSYNC_BLD)/cloudsync.h \ + $(CLOUDSYNC_BLD)/cloudsync-mem-types.h \ + $(CLOUDSYNC_BLD)/cloudsync-messages.h \ + $(CLOUDSYNC_BLD)/cloudsync-common.h + +cloudsync_la_SOURCES = $(cloudsync_sources) $(cloudsynccommon_sources) + +nodist_cloudsync_la_SOURCES = cloudsync-autogen-fops.c cloudsync-autogen-fops.h +BUILT_SOURCES = cloudsync-autogen-fops.h + +cloudsync_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +cloudsync_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIB_DL) + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -DCS_PLUGINDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins\" +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) + +noinst_PYTHON = cloudsync-fops-c.py cloudsync-fops-h.py +EXTRA_DIST = cloudsync-autogen-fops-tmpl.c cloudsync-autogen-fops-tmpl.h + +cloudsync-autogen-fops.c: cloudsync-fops-c.py cloudsync-autogen-fops-tmpl.c + $(PYTHON) $(CLOUDSYNC_SRC)/cloudsync-fops-c.py \ + $(CLOUDSYNC_SRC)/cloudsync-autogen-fops-tmpl.c > $@ + +cloudsync-autogen-fops.h: cloudsync-fops-h.py cloudsync-autogen-fops-tmpl.h + $(PYTHON) $(CLOUDSYNC_SRC)/cloudsync-fops-h.py \ + $(CLOUDSYNC_SRC)/cloudsync-autogen-fops-tmpl.h > $@ + +CLEANFILES = $(nodist_cloudsync_la_SOURCES) + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/cloudsync.so diff --git a/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c new file mode 100644 index 00000000000..ee63f983980 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c @@ -0,0 +1,30 @@ +/* + Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* File: cloudsync-autogen-fops-tmpl.c + * This file contains the CLOUDSYNC autogenerated FOPs. This is run through + * the code generator, generator.py to generate the required FOPs. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <dlfcn.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include "cloudsync.h" +#include "cloudsync-common.h" +#include <glusterfs/call-stub.h> + +#pragma generate diff --git a/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h new file mode 100644 index 00000000000..d922c77d8aa --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* File: clousync-autogen-fops-tmpl.h + * This file contains the cloudsync autogenerated FOPs declarations. 
+ */ + +#ifndef _CLOUDSYNC_AUTOGEN_FOPS_H +#define _CLOUDSYNC_AUTOGEN_FOPS_H + +#include <glusterfs/xlator.h> +#include "cloudsync.h" +#include "cloudsync-common.h" + +#pragma generate + +#endif /* _CLOUDSYNC_AUTOGEN_FOPS_H */ diff --git a/xlators/features/cloudsync/src/cloudsync-common.c b/xlators/features/cloudsync/src/cloudsync-common.c new file mode 100644 index 00000000000..445a31b90e7 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-common.c @@ -0,0 +1,60 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "cloudsync-common.h" + +void +cs_xattrinfo_wipe(cs_local_t *local) +{ + if (local->xattrinfo.lxattr) { + if (local->xattrinfo.lxattr->file_path) + GF_FREE(local->xattrinfo.lxattr->file_path); + + if (local->xattrinfo.lxattr->volname) + GF_FREE(local->xattrinfo.lxattr->volname); + + GF_FREE(local->xattrinfo.lxattr); + } +} + +void +cs_local_wipe(xlator_t *this, cs_local_t *local) +{ + if (!local) + return; + + loc_wipe(&local->loc); + + if (local->fd) { + fd_unref(local->fd); + local->fd = NULL; + } + + if (local->stub) { + call_stub_destroy(local->stub); + local->stub = NULL; + } + + if (local->xattr_req) + dict_unref(local->xattr_req); + + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + + if (local->dlfd) + fd_unref(local->dlfd); + + if (local->remotepath) + GF_FREE(local->remotepath); + + cs_xattrinfo_wipe(local); + + mem_put(local); +} diff --git a/xlators/features/cloudsync/src/cloudsync-common.h b/xlators/features/cloudsync/src/cloudsync-common.h new file mode 100644 index 00000000000..11d233460a4 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-common.h @@ -0,0 +1,134 @@ +/* + Copyright (c) 2018 Red 
Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CLOUDSYNC_COMMON_H +#define _CLOUDSYNC_COMMON_H + +#include <glusterfs/glusterfs.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/xlator.h> +#include <glusterfs/syncop.h> +#include <glusterfs/compat-errno.h> +#include "cloudsync-mem-types.h" +#include "cloudsync-messages.h" + +typedef struct cs_loc_xattr { + char *file_path; + uuid_t uuid; + uuid_t gfid; + char *volname; +} cs_loc_xattr_t; + +typedef struct cs_size_xattr { + uint64_t size; + uint64_t blksize; + uint64_t blocks; +} cs_size_xattr_t; + +typedef struct cs_local { + loc_t loc; + fd_t *fd; + call_stub_t *stub; + call_frame_t *main_frame; + int op_errno; + int op_ret; + fd_t *dlfd; + off_t dloffset; + struct iatt stbuf; + dict_t *xattr_rsp; + dict_t *xattr_req; + glusterfs_fop_t fop; + gf_boolean_t locked; + int call_cnt; + inode_t *inode; + char *remotepath; + + struct { + /* offset, flags and size are the information needed + * by read fop for remote read operation. These will be + * populated in cloudsync read fop, before being passed + * on to the plugin performing remote read. 
+ */ + off_t offset; + uint32_t flags; + size_t size; + cs_loc_xattr_t *lxattr; + } xattrinfo; + +} cs_local_t; + +typedef int (*fop_download_t)(call_frame_t *frame, void *config); + +typedef int (*fop_remote_read_t)(call_frame_t *, void *); + +typedef void *(*store_init)(xlator_t *this); + +typedef int (*store_reconfigure)(xlator_t *this, dict_t *options); + +typedef void (*store_fini)(void *config); + +struct cs_remote_stores { + char *name; /* store name */ + void *config; /* store related information */ + fop_download_t dlfop; /* store specific download function */ + fop_remote_read_t rdfop; /* store specific read function */ + store_init init; /* store init to initialize store config */ + store_reconfigure reconfigure; /* reconfigure store config */ + store_fini fini; + void *handle; /* shared library handle*/ +}; + +typedef struct cs_private { + xlator_t *this; + struct cs_remote_stores *stores; + gf_boolean_t abortdl; + pthread_spinlock_t lock; + gf_boolean_t remote_read; +} cs_private_t; + +void +cs_local_wipe(xlator_t *this, cs_local_t *local); + +void +cs_xattrinfo_wipe(cs_local_t *local); + +#define CS_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + cs_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + cs_local_wipe(__xl, __local); \ + } while (0) + +#define CS_STACK_DESTROY(frame) \ + do { \ + cs_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + cs_local_wipe(__xl, __local); \ + } while (0) + +typedef struct store_methods { + int (*fop_download)(call_frame_t *frame, void *config); + int (*fop_remote_read)(call_frame_t *, void *); + /* return type should be the store config */ + void *(*fop_init)(xlator_t *this); + int (*fop_reconfigure)(xlator_t *this, dict_t *options); + void (*fop_fini)(void *config); +} store_methods_t; + +#endif /* _CLOUDSYNC_COMMON_H */ diff --git a/xlators/features/cloudsync/src/cloudsync-fops-c.py b/xlators/features/cloudsync/src/cloudsync-fops-c.py new file mode 100755 index 00000000000..c27df97ae58 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-fops-c.py @@ -0,0 +1,324 @@ +#!/usr/bin/python3 + +from __future__ import print_function +import os +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir, '../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, cbk_subs, generate + +FD_DATA_MODIFYING_OP_FOP_TEMPLATE = """ +int32_t +cs_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + int op_errno = EINVAL ; + cs_local_t *local = NULL; + int ret = 0; + cs_inode_ctx_t *ctx = NULL; + gf_cs_obj_state state = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = cs_local_init (this, frame, NULL, fd, GF_FOP_@UPNAME@); + if (!local) { + + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local init failed"); + op_errno = ENOMEM; + goto err; + } + + __cs_inode_ctx_get (this, 
fd->inode, &ctx); + + if (ctx) + state = __cs_get_file_state (fd->inode, ctx); + else + state = GF_CS_LOCAL; + + xdata = xdata ? dict_ref (xdata) : dict_new (); + + if (!xdata) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + op_errno = ENOMEM; + goto err; + } + + local->xattr_req = xdata; + + ret = dict_set_uint32 (local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "dict_set failed key:" + " %s", GF_CS_OBJECT_STATUS); + goto err; + } + + local->stub = fop_@NAME@_stub (frame, cs_resume_@NAME@, + @SHORT_ARGS@); + if (!local->stub) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + op_errno = ENOMEM; + goto err; + } + + + if (state == GF_CS_LOCAL) { + STACK_WIND (frame, cs_@NAME@_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + } else { + local->call_cnt++; + ret = locate_and_execute (frame); + if (ret) { + op_errno = ENOMEM; + goto err; + } + } + + return 0; + +err: + CS_STACK_UNWIND (@NAME@, frame, -1, op_errno, @CBK_ERROR_ARGS@); + + return 0; +} +""" + +FD_DATA_MODIFYING_RESUME_OP_FOP_TEMPLATE = """ +int32_t +cs_resume_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + int ret = 0; + + ret = cs_resume_postprocess (this, frame, fd->inode); + if (ret) { + goto unwind; + } + + cs_inodelk_unlock (frame); + + STACK_WIND (frame, cs_@NAME@_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + + return 0; + +unwind: + + cs_inodelk_unlock (frame); + + cs_common_cbk (frame); + + return 0; +} +""" +FD_DATA_MODIFYING_OP_FOP_CBK_TEMPLATE = """ +int32_t +cs_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + cs_local_t *local = NULL; + int ret = 0; + uint64_t val = 0; + fd_t *fd = NULL; + + local = frame->local; + fd = local->fd; + + /* Do we need lock here? 
*/ + local->call_cnt++; + + if (op_ret == -1) { + ret = dict_get_uint64 (xdata, GF_CS_OBJECT_STATUS, &val); + if (ret == 0) { + if (val == GF_CS_ERROR) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, + "could not get file state, unwinding"); + op_ret = -1; + op_errno = EIO; + goto unwind; + } else { + __cs_inode_ctx_update (this, fd->inode, val); + gf_msg (this->name, GF_LOG_INFO, 0, 0, + " state = %" PRIu64, val); + + if (local->call_cnt == 1 && + (val == GF_CS_REMOTE || + val == GF_CS_DOWNLOADING)) { + gf_msg (this->name, GF_LOG_INFO, 0, + 0, " will repair and download " + "the file, current state : %" + PRIu64, val); + goto repair; + } else { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, + "second @NAME@, Unwinding"); + goto unwind; + } + } + } else { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "file state " + "could not be figured, unwinding"); + goto unwind; + } + } else { + /* successful @NAME@ => file is local */ + __cs_inode_ctx_update (this, fd->inode, GF_CS_LOCAL); + gf_msg (this->name, GF_LOG_INFO, 0, 0, "state : GF_CS_LOCAL" + ", @NAME@ successful"); + + goto unwind; + } + +repair: + ret = locate_and_execute (frame); + if (ret) { + goto unwind; + } + + return 0; + +unwind: + CS_STACK_UNWIND (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@); + + return 0; +} +""" + +LOC_STAT_OP_FOP_TEMPLATE = """ +int32_t +cs_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + int op_errno = EINVAL; + cs_local_t *local = NULL; + int ret = 0; + + local = cs_local_init (this, frame, loc, NULL, GF_FOP_@UPNAME@); + if (!local) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local is NULL"); + op_errno = ENOMEM; + goto err; + } + + if (loc->inode->ia_type == IA_IFDIR) + goto wind; + + xdata = xdata ? 
dict_ref (xdata) : dict_new (); + + if (!xdata) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + op_errno = ENOMEM; + goto err; + } + + local->xattr_req = xdata; + + ret = dict_set_uint32 (local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "dict_set failed key:" + " %s", GF_CS_OBJECT_STATUS); + goto err; + } + +wind: + STACK_WIND (frame, cs_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + + return 0; +err: + CS_STACK_UNWIND (@NAME@, frame, -1, op_errno, @CBK_ERROR_ARGS@); + + return 0; +} +""" + +LOC_STAT_OP_FOP_CBK_TEMPLATE = """ +int32_t +cs_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + int ret = 0; + uint64_t val = 0; + loc_t *loc = NULL; + cs_local_t *local = NULL; + + local = frame->local; + + loc = &local->loc; + + if (op_ret == 0) { + ret = dict_get_uint64 (xdata, GF_CS_OBJECT_STATUS, &val); + if (!ret) { + ret = __cs_inode_ctx_update (this, loc->inode, val); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, + "ctx update failed"); + } + } + } else { + cs_inode_ctx_reset (this, loc->inode); + } + + CS_STACK_UNWIND (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@); + + return 0; +} +""" + +# All xlator FOPs are covered in the following section just to create a clarity +# The lists themselves are not used. 
+entry_ops = ['mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', 'rename', 'link', + 'create'] +special_ops = ['statfs', 'lookup', 'ipc', 'compound', 'icreate', 'namelink'] +ignored_ops = ['getspec'] +inode_ops = ['stat', 'readlink', 'truncate', 'open', 'setxattr', 'getxattr', + 'removexattr', 'opendir', 'access', 'inodelk', 'entrylk', + 'xattrop', 'setattr', 'lease', 'getactivelk', 'setactivelk', + 'discover'] +fd_ops = ['readv', 'writev', 'flush', 'fsync', 'fsyncdir', 'ftruncate', + 'fstat', 'lk', 'readdir', 'finodelk', 'fentrylk', 'fxattrop', + 'fsetxattr', 'fgetxattr', 'rchecksum', 'fsetattr', 'readdirp', + 'fremovexattr', 'fallocate', 'discard', 'zerofill', 'seek'] + + +# These are the current actual lists used to generate the code + +# The following list contains fops which are fd based that modifies data +fd_data_modify_op_fop_template = ['writev', 'flush', 'fsync', + 'ftruncate', 'rchecksum', 'fallocate', + 'discard', 'zerofill', 'seek'] + +# The following list contains fops which are entry based that does not change +# data +loc_stat_op_fop_template = ['lookup', 'stat', 'discover', 'access', 'setattr', + 'getattr'] + +# These fops need a separate implementation +special_fops = ['statfs', 'setxattr', 'unlink', 'getxattr', + 'truncate', 'fstat', 'readv', 'readdirp'] + +def gen_defaults(): + for name in ops: + if name in fd_data_modify_op_fop_template: + print(generate(FD_DATA_MODIFYING_OP_FOP_CBK_TEMPLATE, name, cbk_subs)) + print(generate(FD_DATA_MODIFYING_RESUME_OP_FOP_TEMPLATE, name, fop_subs)) + print(generate(FD_DATA_MODIFYING_OP_FOP_TEMPLATE, name, fop_subs)) + elif name in loc_stat_op_fop_template: + print(generate(LOC_STAT_OP_FOP_CBK_TEMPLATE, name, cbk_subs)) + print(generate(LOC_STAT_OP_FOP_TEMPLATE, name, fop_subs)) + +for l in open(sys.argv[1], 'r').readlines(): + if l.find('#pragma generate') != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_defaults() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git 
a/xlators/features/cloudsync/src/cloudsync-fops-h.py b/xlators/features/cloudsync/src/cloudsync-fops-h.py new file mode 100755 index 00000000000..faa2de651a7 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-fops-h.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +from __future__ import print_function +import os +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir, '../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, cbk_subs, generate + +OP_FOP_TEMPLATE = """ +int32_t +cs_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@); +""" + +def gen_defaults(): + for name, value in ops.items(): + if name == 'getspec': + continue + print(generate(OP_FOP_TEMPLATE, name, fop_subs)) + + +for l in open(sys.argv[1], 'r').readlines(): + if l.find('#pragma generate') != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_defaults() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git a/xlators/features/cloudsync/src/cloudsync-mem-types.h b/xlators/features/cloudsync/src/cloudsync-mem-types.h new file mode 100644 index 00000000000..220346405d0 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-mem-types.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. 
+ */ + +#ifndef __CLOUDSYNC_MEM_TYPES_H__ +#define __CLOUDSYNC_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> +enum cs_mem_types_ { + gf_cs_mt_cs_private_t = gf_common_mt_end + 1, + gf_cs_mt_cs_remote_stores_t, + gf_cs_mt_cs_inode_ctx_t, + gf_cs_mt_cs_lxattr_t, + gf_cs_mt_end +}; +#endif /* __CLOUDSYNC_MEM_TYPES_H__ */ diff --git a/xlators/features/cloudsync/src/cloudsync-messages.h b/xlators/features/cloudsync/src/cloudsync-messages.h new file mode 100644 index 00000000000..fb08f72de7f --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-messages.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __CLOUDSYNC_MESSAGES_H__ +#define __CLOUDSYNC_MESSAGES_H__ + +/*TODO: define relevant message ids */ + +#endif /* __CLOUDSYNC_MESSAGES_H__ */ diff --git a/xlators/features/protect/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/features/protect/Makefile.am +++ b/xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am new file mode 100644 index 00000000000..fb6b0580c6d --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am @@ -0,0 +1,11 @@ +if BUILD_AMAZONS3_PLUGIN + AMAZONS3_DIR = cloudsyncs3 +endif + +if BUILD_CVLT_PLUGIN + CVLT_DIR = cvlt +endif + +SUBDIRS = ${AMAZONS3_DIR} ${CVLT_DIR} + +CLEANFILES = diff --git a/xlators/features/mac-compat/Makefile.am 
b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/features/mac-compat/Makefile.am +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am new file mode 100644 index 00000000000..6509426ef87 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am @@ -0,0 +1,12 @@ +csp_LTLIBRARIES = cloudsyncs3.la +cspdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins + +cloudsyncs3_la_SOURCES = libcloudsyncs3.c $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-common.c +cloudsyncs3_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +cloudsyncs3_la_LDFLAGS = -module -export-symbols $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym $(GF_XLATOR_LDFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src -lcurlpp -lcryptopp +noinst_HEADERS = libcloudsyncs3.h libcloudsyncs3-mem-types.h +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) -lcurl -lcrypto -I$(top_srcdir)/xlators/features/cloudsync/src +CLEANFILES = + +EXTRA_DIST = libcloudsyncs3.sym diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h new file mode 100644 index 00000000000..7ccfcc9f4b6 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. 
+ * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __LIBAWS_MEM_TYPES_H__ +#define __LIBAWS_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> +enum libaws_mem_types_ { + gf_libaws_mt_aws_private_t = gf_common_mt_end + 1, + gf_libaws_mt_end +}; +#endif /* __LIBAWS_MEM_TYPES_H__ */ diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c new file mode 100644 index 00000000000..23c3599825a --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c @@ -0,0 +1,584 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <stdlib.h> +#include <openssl/hmac.h> +#include <openssl/evp.h> +#include <openssl/bio.h> +#include <openssl/buffer.h> +#include <openssl/crypto.h> +#include <curl/curl.h> +#include <glusterfs/xlator.h> +#include <glusterfs/glusterfs.h> +#include "libcloudsyncs3.h" +#include "cloudsync-common.h" + +#define RESOURCE_SIZE 4096 + +store_methods_t store_ops = { + .fop_download = aws_download_s3, + .fop_init = aws_init, + .fop_reconfigure = aws_reconfigure, + .fop_fini = aws_fini, +}; + +typedef struct aws_private { + char *hostname; + char *bucketid; + char *awssekey; + char *awskeyid; + gf_boolean_t abortdl; + pthread_spinlock_t lock; +} aws_private_t; + +void * +aws_init(xlator_t *this) +{ + aws_private_t *priv = NULL; + char *temp_str = NULL; + int ret = 0; + + priv = GF_CALLOC(1, sizeof(aws_private_t), gf_libaws_mt_aws_private_t); + if (!priv) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + return NULL; + } + + priv->abortdl = _gf_false; + + pthread_spin_init(&priv->lock, PTHREAD_PROCESS_PRIVATE); + + pthread_spin_lock(&(priv->lock)); + { + if (dict_get_str(this->options, "s3plugin-seckey", &temp_str) == 0) { + priv->awssekey = gf_strdup(temp_str); + if (!priv->awssekey) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws secret key failed"); + ret = -1; + goto unlock; + } + } + + if (dict_get_str(this->options, "s3plugin-keyid", &temp_str) == 0) { + priv->awskeyid = gf_strdup(temp_str); + if (!priv->awskeyid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws key ID failed"); + ret = -1; + goto unlock; + } + } + + if (dict_get_str(this->options, "s3plugin-bucketid", &temp_str) == 0) { + priv->bucketid = gf_strdup(temp_str); + if (!priv->bucketid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws bucketid failed"); + + ret = -1; + goto unlock; + } + } + + if (dict_get_str(this->options, "s3plugin-hostname", &temp_str) == 0) { + priv->hostname = gf_strdup(temp_str); + if 
(!priv->hostname) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws hostname failed"); + + ret = -1; + goto unlock; + } + } + + gf_msg_debug(this->name, 0, + "stored key: %s id: %s " + "bucketid %s hostname: %s", + priv->awssekey, priv->awskeyid, priv->bucketid, + priv->hostname); + } +unlock: + pthread_spin_unlock(&(priv->lock)); + + if (ret == -1) { + GF_FREE(priv->awskeyid); + GF_FREE(priv->awssekey); + GF_FREE(priv->bucketid); + GF_FREE(priv->hostname); + GF_FREE(priv); + priv = NULL; + } + + return (void *)priv; +} + +int +aws_reconfigure(xlator_t *this, dict_t *options) +{ + aws_private_t *priv = NULL; + char *temp_str = NULL; + int ret = 0; + cs_private_t *cspriv = NULL; + + cspriv = this->private; + + priv = cspriv->stores->config; + + if (!priv) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null priv"); + return -1; + } + + pthread_spin_lock(&(priv->lock)); + { + if (dict_get_str(options, "s3plugin-seckey", &temp_str) == 0) { + priv->awssekey = gf_strdup(temp_str); + if (!priv->awssekey) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws secret key failed"); + ret = -1; + goto out; + } + } + + if (dict_get_str(options, "s3plugin-keyid", &temp_str) == 0) { + priv->awskeyid = gf_strdup(temp_str); + if (!priv->awskeyid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws key ID failed"); + ret = -1; + goto out; + } + } + + if (dict_get_str(options, "s3plugin-bucketid", &temp_str) == 0) { + priv->bucketid = gf_strdup(temp_str); + if (!priv->bucketid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws bucketid failed"); + ret = -1; + goto out; + } + } + + if (dict_get_str(options, "s3plugin-hostname", &temp_str) == 0) { + priv->hostname = gf_strdup(temp_str); + if (!priv->hostname) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "initializing aws hostname failed"); + ret = -1; + goto out; + } + } + } +out: + pthread_spin_unlock(&(priv->lock)); + + gf_msg_debug(this->name, 0, + "stored key: %s 
id: %s " + "bucketid %s hostname: %s", + priv->awssekey, priv->awskeyid, priv->bucketid, + priv->hostname); + + return ret; +} + +void +aws_fini(void *config) +{ + aws_private_t *priv = NULL; + + priv = (aws_private_t *)config; /* was "priv = priv": config was ignored and nothing was ever freed */ + + if (priv) { + GF_FREE(priv->hostname); + GF_FREE(priv->bucketid); + GF_FREE(priv->awssekey); + GF_FREE(priv->awskeyid); + + pthread_spin_destroy(&priv->lock); + GF_FREE(priv); + } +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("CS", this, out); + + ret = xlator_mem_acct_init(this, gf_libaws_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Memory accounting init failed"); + return ret; + } +out: + return ret; +} +char * +aws_form_request(char *resource, char **date, char *reqtype, char *bucketid, + char *filepath) +{ + char httpdate[256]; + time_t ctime; + struct tm *gtime = NULL; + char *sign_req = NULL; + int signreq_len = -1; + int date_len = -1; + int res_len = -1; + + ctime = gf_time(); + gtime = gmtime(&ctime); + + date_len = strftime(httpdate, sizeof(httpdate), + "%a, %d %b %Y %H:%M:%S +0000", gtime); + + *date = gf_strndup(httpdate, date_len); + if (*date == NULL) { + gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0, + "memory allocation " + "failure for date"); + goto out; + } + + res_len = snprintf(resource, RESOURCE_SIZE, "%s/%s", bucketid, filepath); + + gf_msg_debug("CS", 0, "resource %s", resource); + + /* 6 accounts for the 4 new line chars, one forward slash and + * one null char */ + signreq_len = res_len + date_len + strlen(reqtype) + 6; + + sign_req = GF_MALLOC(signreq_len, gf_common_mt_char); + if (sign_req == NULL) { + gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0, + "memory allocation " + "failure for sign_req"); + goto out; + } + + snprintf(sign_req, signreq_len, "%s\n\n%s\n%s\n/%s", reqtype, "", *date, + resource); + +out: + return sign_req; +} + +char * +aws_b64_encode(const unsigned char *input, int length) +{ + BIO *bio, *b64; + BUF_MEM *bptr; + char *buff = NULL; + 
b64 = BIO_new(BIO_f_base64()); + bio = BIO_new(BIO_s_mem()); + b64 = BIO_push(b64, bio); + BIO_write(b64, input, length); + BIO_flush(b64); + BIO_get_mem_ptr(b64, &bptr); + + buff = GF_MALLOC(bptr->length, gf_common_mt_char); + memcpy(buff, bptr->data, bptr->length - 1); + buff[bptr->length - 1] = 0; + + BIO_free_all(b64); + + return buff; +} + +char * +aws_sign_request(char *const str, char *awssekey) +{ +#if (OPENSSL_VERSION_NUMBER < 0x1010002f) + HMAC_CTX ctx; +#endif + HMAC_CTX *pctx = NULL; + ; + + unsigned char md[256]; + unsigned len; + char *base64 = NULL; + +#if (OPENSSL_VERSION_NUMBER < 0x1010002f) + HMAC_CTX_init(&ctx); + pctx = &ctx; +#else + pctx = HMAC_CTX_new(); +#endif + HMAC_Init_ex(pctx, awssekey, strlen(awssekey), EVP_sha1(), NULL); + HMAC_Update(pctx, (unsigned char *)str, strlen(str)); + HMAC_Final(pctx, (unsigned char *)md, &len); + +#if (OPENSSL_VERSION_NUMBER < 0x1010002f) + HMAC_CTX_cleanup(pctx); +#else + HMAC_CTX_free(pctx); +#endif + base64 = aws_b64_encode(md, len); + + return base64; +} + +int +aws_dlwritev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + aws_private_t *priv = NULL; + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, op_errno, + "write failed " + ". 
Aborting Download"); + + priv = ((cs_private_t *)this->private)->stores->config; /* this->private is cs_private_t; the plugin config hangs off ->stores, as in aws_write_callback */ + pthread_spin_lock(&(priv->lock)); + { + priv->abortdl = _gf_true; + } + pthread_spin_unlock(&(priv->lock)); + } + + CS_STACK_DESTROY(frame); + + return op_ret; +} + +size_t +aws_write_callback(void *dlbuf, size_t size, size_t nitems, void *mainframe) +{ + call_frame_t *frame = NULL; + fd_t *dlfd = NULL; + int ret = 0; + cs_local_t *local = NULL; + struct iovec iov = { + 0, + }; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + struct iovec dliov = { + 0, + }; + size_t tsize = 0; + xlator_t *this = NULL; + cs_private_t *xl_priv = NULL; + aws_private_t *priv = NULL; + call_frame_t *dlframe = NULL; + + frame = (call_frame_t *)mainframe; + this = frame->this; + xl_priv = this->private; + priv = xl_priv->stores->config; + + pthread_spin_lock(&(priv->lock)); + { + /* returning size other than the size passed from curl will + * abort further download*/ + if (priv->abortdl) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "aborting download"); + pthread_spin_unlock(&(priv->lock)); + return 0; + } + } + pthread_spin_unlock(&(priv->lock)); + + local = frame->local; + dlfd = local->dlfd; + tsize = size * nitems; + + dliov.iov_base = (void *)dlbuf; + dliov.iov_len = tsize; + + ret = iobuf_copy(this->ctx->iobuf_pool, &dliov, 1, &iobref, &iobuf, &iov); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "iobuf_copy failed"); + tsize = 0; /* report a short write so curl aborts instead of silently skipping this chunk */ + goto out; + } + + /* copy frame */ + dlframe = copy_frame(frame); + if (!dlframe) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "copy_frame failed"); + tsize = 0; + goto out; + } + + STACK_WIND(dlframe, aws_dlwritev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, dlfd, &iov, 1, local->dloffset, + 0, iobref, NULL); + + local->dloffset += tsize; + +out: + if (iobuf) + iobuf_unref(iobuf); + if (iobref) + iobref_unref(iobref); + + return tsize; +} + +int +aws_download_s3(call_frame_t *frame, void *config) +{ + char *buf; + int bufsize = -1; + CURL *handle = NULL; + struct curl_slist *slist = 
NULL; + struct curl_slist *tmp = NULL; + xlator_t *this = NULL; + int ret = 0; + long debug = 1; /* CURLOPT_VERBOSE takes a long */ + CURLcode res; + char errbuf[CURL_ERROR_SIZE] = {0}; /* strlen(errbuf) is read after a failed transfer */ + size_t len = 0; + long responsecode; + char *sign_req = NULL; + char *date = NULL; + char *const reqtype = "GET"; + char *signature = NULL; + cs_local_t *local = NULL; + char resource[RESOURCE_SIZE] = { + 0, + }; + aws_private_t *priv = NULL; + + this = frame->this; + + local = frame->local; + + priv = (aws_private_t *)config; + + if (!priv->bucketid || !priv->hostname || !priv->awssekey || + !priv->awskeyid) { + ret = -1; + goto out; + } + + sign_req = aws_form_request(resource, &date, reqtype, priv->bucketid, + local->remotepath); + if (!sign_req) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "null sign_req, " + "aborting download"); + ret = -1; + goto out; + } + + gf_msg_debug("CS", 0, "sign_req %s date %s", sign_req, date); + + signature = aws_sign_request(sign_req, priv->awssekey); + if (!signature) { + gf_msg("CS", GF_LOG_ERROR, 0, 0, + "null signature, " + "aborting download"); + ret = -1; + goto out; + } + + handle = curl_easy_init(); + this = frame->this; + + /* special numbers 6, 20, 10 accounts for static characters in the + * below snprintf string format arguments; strlen(resource) covers the + * path part of the URL, which is formatted into the same buffer */ + bufsize = strlen(date) + 6 + strlen(priv->awskeyid) + strlen(signature) + + 20 + strlen(priv->hostname) + strlen(resource) + 10; + + buf = (char *)alloca(bufsize); + if (!buf) { + gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0, + "mem allocation " + "failed for buf"); + ret = -1; + goto out; + } + + snprintf(buf, bufsize, "Date: %s", date); + slist = curl_slist_append(slist, buf); + snprintf(buf, bufsize, "Authorization: AWS %s:%s", priv->awskeyid, + signature); + slist = curl_slist_append(slist, buf); + snprintf(buf, bufsize, "https://%s/%s", priv->hostname, resource); + + if (gf_log_get_loglevel() >= GF_LOG_DEBUG) { + tmp = slist; + while (tmp) { + gf_msg_debug(this->name, 0, "slist for curl - %s", tmp->data); + tmp = tmp->next; + } + } + + curl_easy_setopt(handle, 
CURLOPT_HTTPHEADER, slist); + curl_easy_setopt(handle, CURLOPT_URL, buf); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, aws_write_callback); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, frame); + curl_easy_setopt(handle, CURLOPT_VERBOSE, debug); + curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf); + + res = curl_easy_perform(handle); + if (res != CURLE_OK) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "download failed. err: %s\n", + curl_easy_strerror(res)); + ret = -1; + len = strlen(errbuf); + if (len) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "curl failure %s", errbuf); + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "curl error " + "%s\n", + curl_easy_strerror(res)); + } + } + + if (res == CURLE_OK) { + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &responsecode); + gf_msg_debug(this->name, 0, "response code %ld", responsecode); + if (responsecode != 200) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "curl download failed"); + } + } + + curl_slist_free_all(slist); + curl_easy_cleanup(handle); + +out: + if (sign_req) + GF_FREE(sign_req); + if (date) + GF_FREE(date); + if (signature) + GF_FREE(signature); + + return ret; +} + +struct volume_options cs_options[] = { + {.key = {"s3plugin-seckey"}, + .type = GF_OPTION_TYPE_STR, + .description = "aws secret key"}, + {.key = {"s3plugin-keyid"}, + .type = GF_OPTION_TYPE_STR, + .description = "aws key ID" + + }, + {.key = {"s3plugin-bucketid"}, + .type = GF_OPTION_TYPE_STR, + .description = "aws bucketid"}, + {.key = {"s3plugin-hostname"}, + .type = GF_OPTION_TYPE_STR, + .description = "aws hostname e.g. 
s3.amazonaws.com"}, + {.key = {NULL}}, +}; diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h new file mode 100644 index 00000000000..85ae669486b --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h @@ -0,0 +1,50 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _LIBAWS_H +#define _LIBAWS_H + +#include <glusterfs/glusterfs.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/xlator.h> +#include <glusterfs/syncop.h> +#include <curl/curl.h> +#include "cloudsync-common.h" +#include "libcloudsyncs3-mem-types.h" + +char * +aws_b64_encode(const unsigned char *input, int length); + +size_t +aws_write_callback(void *dlbuf, size_t size, size_t nitems, void *mainframe); + +int +aws_download_s3(call_frame_t *frame, void *config); + +int +aws_dlwritev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +void * +aws_init(xlator_t *this); + +int +aws_reconfigure(xlator_t *this, dict_t *options); + +char * +aws_form_request(char *resource, char **date, char *reqtype, char *bucketid, + char *filepath); +char * +aws_sign_request(char *const str, char *awssekey); + +void +aws_fini(void *config); + +#endif diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym new file mode 100644 index 00000000000..0bc273670d5 --- /dev/null +++ 
b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym @@ -0,0 +1 @@ +store_ops diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am new file mode 100644 index 00000000000..b512464f157 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am @@ -0,0 +1,12 @@ +csp_LTLIBRARIES = cloudsynccvlt.la +cspdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins + +cloudsynccvlt_la_SOURCES = libcvlt.c $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-common.c +cloudsynccvlt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +cloudsynccvlt_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src +noinst_HEADERS = archivestore.h libcvlt.h libcvlt-mem-types.h cvlt-messages.h +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) -I$(top_srcdir)/xlators/features/cloudsync/src +CLEANFILES = + +EXTRA_DIST = libcloudsynccvlt.sym diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h new file mode 100644 index 00000000000..7230ef77337 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h @@ -0,0 +1,203 @@ +/* + Copyright (c) 2018 Commvault Systems, Inc. 
<http://www.commvault.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __ARCHIVESTORE_H__ +#define __ARCHIVESTORE_H__ + +#include <stdlib.h> +#include <stddef.h> +#include <stdint.h> +#include <dlfcn.h> +#include <uuid/uuid.h> + +#define CS_XATTR_ARCHIVE_UUID "trusted.cloudsync.uuid" +#define CS_XATTR_PRODUCT_ID "trusted.cloudsync.product-id" +#define CS_XATTR_STORE_ID "trusted.cloudsync.store-id" + +struct _archstore_methods; +typedef struct _archstore_methods archstore_methods_t; + +struct _archstore_desc { + void *priv; /* Private field for store mgmt. */ + /* To be used only by archive store*/ +}; +typedef struct _archstore_desc archstore_desc_t; + +struct _archstore_info { + char *id; /* Identifier for the archivestore */ + uint32_t idlen; /* Length of identifier string */ + char *prod; /* Name of the data mgmt. product */ + uint32_t prodlen; /* Length of the product string */ +}; +typedef struct _archstore_info archstore_info_t; + +struct _archstore_fileinfo { + uuid_t uuid; /* uuid of the file */ + char *path; /* file path */ + uint32_t pathlength; /* length of file path */ +}; +typedef struct _archstore_fileinfo archstore_fileinfo_t; + +struct _app_callback_info { + archstore_info_t *src_archstore; + archstore_fileinfo_t *src_archfile; + archstore_info_t *dest_archstore; + archstore_fileinfo_t *dest_archfile; +}; +typedef struct _app_callback_info app_callback_info_t; + +typedef void (*app_callback_t)(archstore_desc_t *, app_callback_info_t *, + void *, int64_t, int32_t); + +enum _archstore_scan_type { FULL = 1, INCREMENTAL = 2 }; +typedef enum _archstore_scan_type archstore_scan_type_t; + +typedef int32_t archstore_errno_t; + +/* + * Initialize archive store. 
+ * arg1 pointer to structure containing archive store information + * arg2 error number if any generated during the initialization + * arg3 name of the log file + */ +typedef int32_t (*init_archstore_t)(archstore_desc_t *, archstore_errno_t *, + const char *); + +/* + * Clean up archive store. + * arg1 pointer to structure containing archive store information + * arg2 error number if any generated during the cleanup + */ +typedef int32_t (*term_archstore_t)(archstore_desc_t *, archstore_errno_t *); + +/* + * Read the contents of the file from archive store + * arg1 pointer to structure containing archive store description + * arg2 pointer to structure containing archive store information + * arg3 pointer to structure containing information about file to be read + * arg4 offset in the file from which data should be read + * arg5 buffer where the data should be read + * arg6 number of bytes of data to be read + * arg7 error number if any generated during the read from file + * arg8 callback handler to be invoked after the data is read + * arg9 cookie to be passed when callback is invoked + */ +typedef int32_t (*read_archstore_t)(archstore_desc_t *, archstore_info_t *, + archstore_fileinfo_t *, off_t, char *, + size_t, archstore_errno_t *, app_callback_t, + void *); + +/* + * Restore the contents of the file from archive store + * This is basically in-place restore + * arg1 pointer to structure containing archive store description + * arg2 pointer to structure containing archive store information + * arg3 pointer to structure containing information about file to be restored + * arg4 error number if any generated during the file restore + * arg5 callback to be invoked after the file is restored + * arg6 cookie to be passed when callback is invoked + */ +typedef int32_t (*recall_archstore_t)(archstore_desc_t *, archstore_info_t *, + archstore_fileinfo_t *, + archstore_errno_t *, app_callback_t, + void *); + +/* + * Restore the contents of the file from archive store to 
a different store + * This is basically out-of-place restore + * arg1 pointer to structure containing archive store description + * arg2 pointer to structure containing source archive store information + * arg3 pointer to structure containing information about file to be restored + * arg4 pointer to structure containing destination archive store information + * arg5 pointer to structure containing information about the location to + which the file will be restored + * arg6 error number if any generated during the file restore + * arg7 callback to be invoked after the file is restored + * arg8 cookie to be passed when callback is invoked + */ +typedef int32_t (*restore_archstore_t)(archstore_desc_t *, archstore_info_t *, + archstore_fileinfo_t *, + archstore_info_t *, + archstore_fileinfo_t *, + archstore_errno_t *, app_callback_t, + void *); + +/* + * Archive the contents of the file to archive store + * arg1 pointer to structure containing archive store description + * arg2 pointer to structure containing source archive store information + * arg3 pointer to structure containing information about files to be archived + * arg4 pointer to structure containing destination archive store information + * arg5 pointer to structure containing information about files that failed + * to be archived + * arg6 error number if any generated during the file archival + * arg7 callback to be invoked after the file is archived + * arg8 cookie to be passed when callback is invoked + */ +typedef int32_t (*archive_archstore_t)(archstore_desc_t *, archstore_info_t *, + archstore_fileinfo_t *, + archstore_info_t *, + archstore_fileinfo_t *, + archstore_errno_t *, app_callback_t, + void *); + +/* + * Backup list of files provided in the input file + * arg1 pointer to structure containing archive store description + * arg2 pointer to structure containing source archive store information + * arg3 pointer to structure containing information about files to be backed up + * arg4 pointer to 
/*
 * Table of entry points exported by an archive store implementation.
 * Every member is a function pointer of the corresponding *_archstore_t
 * type; the table is filled in by get_archstore_methods().
 */
struct _archstore_methods {
    init_archstore_t init;       /* initialize the archive store */
    term_archstore_t fini;       /* clean up the archive store */
    backup_archstore_t backup;   /* back up the list of files provided */
    archive_archstore_t archive; /* archive file contents to the store */
    scan_archstore_t scan;       /* find the files that need to be backed up */
    restore_archstore_t restore; /* out-of-place restore to a different store */
    recall_archstore_t recall;   /* in-place restore from the archive store */
    read_archstore_t read;       /* read file contents from the archive store */
};
+ */ +int32_t +get_archstore_methods(archstore_methods_t *); + +#endif /* End of __ARCHIVESTORE_H__ */ diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h new file mode 100644 index 00000000000..57c9aa77da0 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _CVLT_MESSAGES_H_ +#define _CVLT_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(CVLT, CVLT_EXTRACTION_FAILED, CVLT_FREE, + CVLT_RESOURCE_ALLOCATION_FAILED, CVLT_RESTORE_FAILED, + CVLT_READ_FAILED, CVLT_NO_MEMORY, CVLT_DLOPEN_FAILED); + +#endif /* !_CVLT_MESSAGES_H_ */ diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym new file mode 100644 index 00000000000..0bc273670d5 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym @@ -0,0 +1 @@ +store_ops diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h new file mode 100644 index 00000000000..c24fab8bfe7 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2018 Commvault Systems, Inc. <http://www.commvault.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. 
+ */ + +#ifndef __LIBCVLT_MEM_TYPES_H__ +#define __LIBCVLT_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> +enum libcvlt_mem_types_ { + gf_libcvlt_mt_cvlt_private_t = gf_common_mt_end + 1, + gf_libcvlt_mt_end +}; +#endif /* __LIBCVLT_MEM_TYPES_H__ */ diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c new file mode 100644 index 00000000000..5b7272bb448 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c @@ -0,0 +1,842 @@ +#include <stdlib.h> +#include <glusterfs/xlator.h> +#include <glusterfs/glusterfs.h> +#include "libcvlt.h" +#include "cloudsync-common.h" +#include "cvlt-messages.h" + +#define LIBARCHIVE_SO "libopenarchive.so" +#define ALIGN_SIZE 4096 +#define CVLT_TRAILER "cvltv1" + +store_methods_t store_ops = { + .fop_download = cvlt_download, + .fop_init = cvlt_init, + .fop_reconfigure = cvlt_reconfigure, + .fop_fini = cvlt_fini, + .fop_remote_read = cvlt_read, +}; + +static const int32_t num_req = 32; +static const int32_t num_iatt = 32; +static char *plugin = "cvlt_cloudSync"; + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_libcvlt_mt_end + 1); + + if (ret != 0) { + return ret; + } + + return ret; +} + +static void +cvlt_free_resources(archive_t *arch) +{ + /* + * We will release all the resources that were allocated by the xlator. + * Check whether there are any buffers which have not been released + * back to a mempool. 
+ */ + + if (arch->handle) { + dlclose(arch->handle); + } + + if (arch->iobuf_pool) { + iobuf_pool_destroy(arch->iobuf_pool); + } + + if (arch->req_pool) { + mem_pool_destroy(arch->req_pool); + arch->req_pool = NULL; + } + + return; +} + +static int32_t +cvlt_extract_store_fops(xlator_t *this, archive_t *arch) +{ + int32_t op_ret = -1; + get_archstore_methods_t get_archstore_methods; + + /* + * libopenarchive.so defines methods for performing data management + * operations. We will extract the methods from library and these + * methods will be invoked for moving data between glusterfs volume + * and the data management product. + */ + + VALIDATE_OR_GOTO(arch, err); + + arch->handle = dlopen(LIBARCHIVE_SO, RTLD_NOW); + if (!arch->handle) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_DLOPEN_FAILED, + " failed to open %s ", LIBARCHIVE_SO); + return op_ret; + } + + dlerror(); /* Clear any existing error */ + + get_archstore_methods = dlsym(arch->handle, "get_archstore_methods"); + if (!get_archstore_methods) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " Error extracting get_archstore_methods()"); + dlclose(arch->handle); + arch->handle = NULL; + return op_ret; + } + + op_ret = get_archstore_methods(&(arch->fops)); + if (op_ret) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " Failed to extract methods in get_archstore_methods"); + dlclose(arch->handle); + arch->handle = NULL; + return op_ret; + } + +err: + return op_ret; +} + +static int32_t +cvlt_alloc_resources(xlator_t *this, archive_t *arch, int num_req, int num_iatt) +{ + /* + * Initialize information about all the memory pools that will be + * used by this xlator. 
+ */ + arch->nreqs = 0; + + arch->req_pool = NULL; + + arch->handle = NULL; + arch->xl = this; + + arch->req_pool = mem_pool_new(cvlt_request_t, num_req); + if (!arch->req_pool) { + goto err; + } + + arch->iobuf_pool = iobuf_pool_new(); + if (!arch->iobuf_pool) { + goto err; + } + + if (cvlt_extract_store_fops(this, arch)) { + goto err; + } + + return 0; + +err: + + return -1; +} + +static void +cvlt_req_init(cvlt_request_t *req) +{ + sem_init(&(req->sem), 0, 0); + + return; +} + +static void +cvlt_req_destroy(cvlt_request_t *req) +{ + if (req->iobuf) { + iobuf_unref(req->iobuf); + } + + if (req->iobref) { + iobref_unref(req->iobref); + } + + sem_destroy(&(req->sem)); + + return; +} + +static cvlt_request_t * +cvlt_alloc_req(archive_t *arch) +{ + cvlt_request_t *reqptr = NULL; + + if (!arch) { + goto err; + } + + if (arch->req_pool) { + reqptr = mem_get0(arch->req_pool); + if (reqptr) { + cvlt_req_init(reqptr); + } + } + + if (reqptr) { + LOCK(&(arch->lock)); + arch->nreqs++; + UNLOCK(&(arch->lock)); + } + +err: + return reqptr; +} + +static int32_t +cvlt_free_req(archive_t *arch, cvlt_request_t *reqptr) +{ + if (!reqptr) { + goto err; + } + + if (!arch) { + goto err; + } + + if (arch->req_pool) { + /* + * Free the request resources if they exist. + */ + + cvlt_req_destroy(reqptr); + mem_put(reqptr); + + LOCK(&(arch->lock)); + arch->nreqs--; + UNLOCK(&(arch->lock)); + } + + return 0; + +err: + return -1; +} + +static int32_t +cvlt_init_xlator(xlator_t *this, archive_t *arch, int num_req, int num_iatt) +{ + int32_t ret = -1; + int32_t errnum = -1; + int32_t locked = 0; + + /* + * Perform all the initializations needed for brining up the xlator. 
+ */ + if (!arch) { + goto err; + } + + LOCK_INIT(&(arch->lock)); + LOCK(&(arch->lock)); + + locked = 1; + + ret = cvlt_alloc_resources(this, arch, num_req, num_iatt); + + if (ret) { + goto err; + } + + /* + * Now that the fops have been extracted initialize the store + */ + ret = arch->fops.init(&(arch->descinfo), &errnum, plugin); + if (ret) { + goto err; + } + + UNLOCK(&(arch->lock)); + locked = 0; + ret = 0; + + return ret; + +err: + if (arch) { + cvlt_free_resources(arch); + + if (locked) { + UNLOCK(&(arch->lock)); + } + } + + return ret; +} + +static int32_t +cvlt_term_xlator(archive_t *arch) +{ + int32_t errnum = -1; + + if (!arch) { + goto err; + } + + LOCK(&(arch->lock)); + + /* + * Release the resources that have been allocated inside store + */ + arch->fops.fini(&(arch->descinfo), &errnum); + + cvlt_free_resources(arch); + + UNLOCK(&(arch->lock)); + + GF_FREE(arch); + + return 0; + +err: + return -1; +} + +static int32_t +cvlt_init_store_info(archive_t *priv, archstore_info_t *store_info) +{ + if (!store_info) { + return -1; + } + + store_info->prod = priv->product_id; + store_info->prodlen = strlen(priv->product_id); + + store_info->id = priv->store_id; + store_info->idlen = strlen(priv->store_id); + + return 0; +} + +static int32_t +cvlt_init_file_info(cs_loc_xattr_t *xattr, archstore_fileinfo_t *file_info) +{ + if (!xattr || !file_info) { + return -1; + } + + gf_uuid_copy(file_info->uuid, xattr->uuid); + file_info->path = xattr->file_path; + file_info->pathlength = strlen(xattr->file_path); + + return 0; +} + +static int32_t +cvlt_init_gluster_store_info(cs_loc_xattr_t *xattr, + archstore_info_t *store_info) +{ + static char *product = "glusterfs"; + + if (!xattr || !store_info) { + return -1; + } + + store_info->prod = product; + store_info->prodlen = strlen(product); + + store_info->id = xattr->volname; + store_info->idlen = strlen(xattr->volname); + + return 0; +} + +static int32_t +cvlt_init_gluster_file_info(cs_loc_xattr_t *xattr, + 
archstore_fileinfo_t *file_info) +{ + if (!xattr || !file_info) { + return -1; + } + + gf_uuid_copy(file_info->uuid, xattr->gfid); + file_info->path = xattr->file_path; + file_info->pathlength = strlen(xattr->file_path); + + return 0; +} + +static void +cvlt_copy_stat_info(struct iatt *buf, cs_size_xattr_t *xattrs) +{ + /* + * If the file was archived then the reported size will not be a + * correct one. We need to fix this. + */ + if (buf && xattrs) { + buf->ia_size = xattrs->size; + buf->ia_blksize = xattrs->blksize; + buf->ia_blocks = xattrs->blocks; + } + + return; +} + +static void +cvlt_readv_complete(archstore_desc_t *desc, app_callback_info_t *cbkinfo, + void *cookie, int64_t op_ret, int32_t op_errno) +{ + struct iovec iov; + xlator_t *this = NULL; + struct iatt postbuf = { + 0, + }; + call_frame_t *frame = NULL; + cvlt_request_t *req = (cvlt_request_t *)cookie; + cs_local_t *local = NULL; + cs_private_t *cspriv = NULL; + archive_t *priv = NULL; + + frame = req->frame; + this = frame->this; + local = frame->local; + + cspriv = this->private; + priv = (archive_t *)cspriv->stores->config; + + if (strcmp(priv->trailer, CVLT_TRAILER)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + gf_msg_debug(plugin, 0, + " Read callback invoked offset:%" PRIu64 "bytes: %" PRIu64 + " op : %d ret : %" PRId64 " errno : %d", + req->offset, req->bytes, req->op_type, op_ret, op_errno); + + if (op_ret < 0) { + goto out; + } + + req->iobref = iobref_new(); + if (!req->iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add(req->iobref, req->iobuf); + iov.iov_base = iobuf_ptr(req->iobuf); + iov.iov_len = op_ret; + + cvlt_copy_stat_info(&postbuf, &(req->szxattr)); + + /* + * Hack to notify higher layers of EOF. 
+ */ + if (!postbuf.ia_size || (req->offset + iov.iov_len >= postbuf.ia_size)) { + gf_msg_debug(plugin, 0, " signalling end-of-file for uuid=%s", + uuid_utoa(req->file_info.uuid)); + op_errno = ENOENT; + } + +out: + + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf, + req->iobref, local->xattr_rsp); + + cvlt_free_req(priv, req); + + return; +} + +static void +cvlt_download_complete(archstore_desc_t *store, app_callback_info_t *cbk_info, + void *cookie, int64_t ret, int errcode) +{ + cvlt_request_t *req = (cvlt_request_t *)cookie; + + gf_msg_debug(plugin, 0, + " Download callback invoked ret : %" PRId64 " errno : %d", + ret, errcode); + + req->op_ret = ret; + req->op_errno = errcode; + sem_post(&(req->sem)); + + return; +} + +void * +cvlt_init(xlator_t *this) +{ + int ret = 0; + archive_t *priv = NULL; + + if (!this->children || this->children->next) { + gf_msg(plugin, GF_LOG_ERROR, ENOMEM, 0, + "should have exactly one child"); + ret = -1; + goto out; + } + + if (!this->parents) { + gf_msg(plugin, GF_LOG_ERROR, ENOMEM, 0, + "dangling volume. 
check volfile"); + ret = -1; + goto out; + } + + priv = GF_CALLOC(1, sizeof(archive_t), gf_libcvlt_mt_cvlt_private_t); + if (!priv) { + ret = -1; + goto out; + } + + priv->trailer = CVLT_TRAILER; + if (cvlt_init_xlator(this, priv, num_req, num_iatt)) { + gf_msg(plugin, GF_LOG_ERROR, ENOMEM, 0, "xlator init failed"); + ret = -1; + goto out; + } + + GF_OPTION_INIT("cloudsync-store-id", priv->store_id, str, out); + GF_OPTION_INIT("cloudsync-product-id", priv->product_id, str, out); + + gf_msg(plugin, GF_LOG_INFO, 0, 0, + "store id is : %s " + "product id is : %s.", + priv->store_id, priv->product_id); +out: + if (ret == -1) { + cvlt_term_xlator(priv); + return (NULL); + } + return priv; +} + +int +cvlt_reconfigure(xlator_t *this, dict_t *options) +{ + cs_private_t *cspriv = NULL; + archive_t *priv = NULL; + + cspriv = this->private; + priv = (archive_t *)cspriv->stores->config; + + if (strcmp(priv->trailer, CVLT_TRAILER)) + goto out; + + GF_OPTION_RECONF("cloudsync-store-id", priv->store_id, options, str, out); + + GF_OPTION_RECONF("cloudsync-product-id", priv->product_id, options, str, + out); + gf_msg_debug(plugin, 0, + "store id is : %s " + "product id is : %s.", + priv->store_id, priv->product_id); + return 0; +out: + return -1; +} + +void +cvlt_fini(void *config) +{ + archive_t *priv = NULL; + + priv = (archive_t *)config; + + if (strcmp(priv->trailer, CVLT_TRAILER)) + return; + + cvlt_term_xlator(priv); + gf_msg(plugin, GF_LOG_INFO, 0, CVLT_FREE, " released xlator resources"); + return; +} + +int +cvlt_download(call_frame_t *frame, void *config) +{ + archive_t *parch = NULL; + cs_local_t *local = frame->local; + cs_loc_xattr_t *locxattr = local->xattrinfo.lxattr; + cvlt_request_t *req = NULL; + archstore_info_t dest_storeinfo; + archstore_fileinfo_t dest_fileinfo; + int32_t op_ret, op_errno; + + parch = (archive_t *)config; + + if (strcmp(parch->trailer, CVLT_TRAILER)) { + op_ret = -1; + op_errno = EINVAL; + goto err; + } + + gf_msg_debug(plugin, 0, " download 
invoked for uuid = %s gfid=%s ", + locxattr->uuid, uuid_utoa(locxattr->gfid)); + + if (!(parch->fops.restore)) { + op_errno = ELIBBAD; + goto err; + } + + /* + * Download needs to be processed. Allocate a request. + */ + req = cvlt_alloc_req(parch); + + if (!req) { + gf_msg(plugin, GF_LOG_ERROR, ENOMEM, CVLT_RESOURCE_ALLOCATION_FAILED, + " failed to allocated request for gfid=%s", + uuid_utoa(locxattr->gfid)); + op_errno = ENOMEM; + goto err; + } + + /* + * Initialize the request object. + */ + req->op_type = CVLT_RESTORE_OP; + req->frame = frame; + + /* + * The file is currently residing inside a data management store. + * To restore the file contents we need to provide the information + * about data management store. + */ + op_ret = cvlt_init_store_info(parch, &(req->store_info)); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract store info for gfid=%s", + uuid_utoa(locxattr->gfid)); + goto err; + } + + op_ret = cvlt_init_file_info(locxattr, &(req->file_info)); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract file info for gfid=%s", + uuid_utoa(locxattr->gfid)); + goto err; + } + + /* + * We need to perform in-place restore of the file from data management + * store to gusterfs volume. + */ + op_ret = cvlt_init_gluster_store_info(locxattr, &dest_storeinfo); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract destination store info for gfid=%s", + uuid_utoa(locxattr->gfid)); + goto err; + } + + op_ret = cvlt_init_gluster_file_info(locxattr, &dest_fileinfo); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract file info for gfid=%s", + uuid_utoa(locxattr->gfid)); + goto err; + } + + /* + * Submit the restore request. 
+ */ + op_ret = parch->fops.restore(&(parch->descinfo), &(req->store_info), + &(req->file_info), &dest_storeinfo, + &dest_fileinfo, &op_errno, + cvlt_download_complete, req); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_RESTORE_FAILED, + " failed to restore file gfid=%s from data management store", + uuid_utoa(locxattr->gfid)); + goto err; + } + + /* + * Wait for the restore to complete. + */ + sem_wait(&(req->sem)); + + if (req->op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_RESTORE_FAILED, + " restored failed for gfid=%s", uuid_utoa(locxattr->gfid)); + goto err; + } + + if (req) { + cvlt_free_req(parch, req); + } + + return 0; + +err: + + if (req) { + cvlt_free_req(parch, req); + } + + return -1; +} + +int +cvlt_read(call_frame_t *frame, void *config) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + archive_t *parch = NULL; + cvlt_request_t *req = NULL; + struct iovec iov = { + 0, + }; + struct iobref *iobref; + size_t size = 0; + off_t off = 0; + + cs_local_t *local = frame->local; + cs_loc_xattr_t *locxattr = local->xattrinfo.lxattr; + + size = local->xattrinfo.size; + off = local->xattrinfo.offset; + + parch = (archive_t *)config; + + if (strcmp(parch->trailer, CVLT_TRAILER)) { + op_ret = -1; + op_errno = EINVAL; + goto err; + } + + gf_msg_debug(plugin, 0, + " read invoked for gfid = %s offset = %" PRIu64 + " file_size = %" PRIu64, + uuid_utoa(locxattr->gfid), off, local->stbuf.ia_size); + + if (off >= local->stbuf.ia_size) { + /* + * Hack to notify higher layers of EOF. 
+ */ + + op_errno = ENOENT; + op_ret = 0; + + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_READ_FAILED, + " reporting end-of-file for gfid=%s", uuid_utoa(locxattr->gfid)); + + goto err; + } + + if (!size) { + op_errno = EINVAL; + + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_READ_FAILED, + " zero size read attempted on gfid=%s", + uuid_utoa(locxattr->gfid)); + goto err; + } + + if (!(parch->fops.read)) { + op_errno = ELIBBAD; + goto err; + } + + /* + * The read request need to be processed. Allocate a request. + */ + req = cvlt_alloc_req(parch); + + if (!req) { + gf_msg(plugin, GF_LOG_ERROR, ENOMEM, CVLT_NO_MEMORY, + " failed to allocated request for gfid=%s", + uuid_utoa(locxattr->gfid)); + op_errno = ENOMEM; + goto err; + } + + req->iobuf = iobuf_get_page_aligned(parch->iobuf_pool, size, ALIGN_SIZE); + if (!req->iobuf) { + op_errno = ENOMEM; + goto err; + } + + /* + * Initialize the request object. + */ + req->op_type = CVLT_READ_OP; + req->offset = off; + req->bytes = size; + req->frame = frame; + req->szxattr.size = local->stbuf.ia_size; + req->szxattr.blocks = local->stbuf.ia_blocks; + req->szxattr.blksize = local->stbuf.ia_blksize; + + /* + * The file is currently residing inside a data management store. + * To read the file contents we need to provide the information + * about data management store. + */ + op_ret = cvlt_init_store_info(parch, &(req->store_info)); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract store info for gfid=%s" + " offset=%" PRIu64 " size=%" GF_PRI_SIZET + ", " + " buf=%p", + uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr); + goto err; + } + + op_ret = cvlt_init_file_info(locxattr, &(req->file_info)); + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " failed to extract file info for gfid=%s" + " offset=%" PRIu64 " size=%" GF_PRI_SIZET + ", " + " buf=%p", + uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr); + goto err; + } + + /* + * Submit the read request. 
+ */ + op_ret = parch->fops.read(&(parch->descinfo), &(req->store_info), + &(req->file_info), off, req->iobuf->ptr, size, + &op_errno, cvlt_readv_complete, req); + + if (op_ret < 0) { + gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED, + " read failed on gfid=%s" + " offset=%" PRIu64 " size=%" GF_PRI_SIZET + ", " + " buf=%p", + uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr); + goto err; + } + + return 0; + +err: + + iobref = iobref_new(); + gf_msg_debug(plugin, 0, " read unwinding stack op_ret = %d, op_errno = %d", + op_ret, op_errno); + + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, + &(local->stbuf), iobref, local->xattr_rsp); + + if (iobref) { + iobref_unref(iobref); + } + + if (req) { + cvlt_free_req(parch, req); + } + + return 0; +} diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h new file mode 100644 index 00000000000..c45ac948f6c --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h @@ -0,0 +1,84 @@ +/* + Copyright (c) 2018 Commvault Systems, Inc. <http://www.commvault.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/*
 * State of one request submitted to the archive store. Objects are taken
 * from archive_t.req_pool by cvlt_alloc_req() and returned by
 * cvlt_free_req().
 */
struct _cvlt_request {
    uint64_t offset;       /* file offset of a read (set in cvlt_read) */
    uint64_t bytes;        /* number of bytes requested by a read */
    struct iobuf *iobuf;   /* data buffer of a read; unref'd on destroy */
    struct iobref *iobref; /* created in the read callback; unref'd on destroy */
    call_frame_t *frame;   /* frame of the fop that issued the request */
    cvlt_op_t op_type;     /* CVLT_READ_OP or CVLT_RESTORE_OP in libcvlt.c */
    int32_t op_ret;        /* result stored by cvlt_download_complete() */
    int32_t op_errno;      /* error stored by cvlt_download_complete() */
    xlator_t *this;        /* NOTE(review): not assigned in libcvlt.c - confirm use */
    sem_t sem;             /* posted by the download callback, waited on by
                              cvlt_download() */
    archstore_info_t store_info;    /* source store (product id / store id) */
    archstore_fileinfo_t file_info; /* file inside the store (uuid / path) */
    cs_size_xattr_t szxattr; /* size, blocks and blksize copied from the
                                stat buffer in cvlt_read() */
};
typedef struct _cvlt_request cvlt_request_t;
<http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include "cloudsync.h" +#include "cloudsync-common.h" +#include <glusterfs/call-stub.h> +#include "cloudsync-autogen-fops.h" + +#include <string.h> +#include <dlfcn.h> + +static void +cs_cleanup_private(cs_private_t *priv) +{ + if (priv) { + if (priv->stores) { + priv->stores->fini(priv->stores->config); + GF_FREE(priv->stores); + } + + pthread_spin_destroy(&priv->lock); + GF_FREE(priv); + } + + return; +} + +static struct cs_plugin plugins[] = { + {.name = "cloudsyncs3", + .library = "cloudsyncs3.so", + .description = "cloudsync s3 store."}, +#if defined(__linux__) + {.name = "cvlt", + .library = "cloudsynccvlt.so", + .description = "Commvault content store."}, +#endif + {.name = NULL}, +}; + +int +cs_init(xlator_t *this) +{ + cs_private_t *priv = NULL; + gf_boolean_t per_vol = _gf_false; + int ret = 0; + char *libpath = NULL; + store_methods_t *store_methods = NULL; + void *handle = NULL; + char *temp_str = NULL; + int index = 0; + char *libname = NULL; + + priv = GF_CALLOC(1, sizeof(*priv), gf_cs_mt_cs_private_t); + if (!priv) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + goto out; + } + + priv->this = this; + + this->local_pool = mem_pool_new(cs_local_t, 512); + if (!this->local_pool) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, "initialisation failed."); + ret = -1; + goto out; + } + + this->private = priv; + + GF_OPTION_INIT("cloudsync-remote-read", priv->remote_read, bool, out); + + /* temp workaround. 
Should be configurable through glusterd*/ + per_vol = _gf_true; + + if (per_vol) { + if (dict_get_str_sizen(this->options, "cloudsync-storetype", + &temp_str) == 0) { + for (index = 0; plugins[index].name; index++) { + if (!strcmp(temp_str, plugins[index].name)) { + libname = plugins[index].library; + break; + } + } + } else { + ret = 0; + } + + if (!libname) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, "no plugin enabled"); + ret = 0; + goto out; + } + + ret = gf_asprintf(&libpath, "%s/%s", CS_PLUGINDIR, libname); + if (ret == -1) { + goto out; + } + + handle = dlopen(libpath, RTLD_NOW); + if (!handle) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "could not " + "load the required library. %s", + dlerror()); + ret = 0; + goto out; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "loading library:%s successful", libname); + } + + priv->stores = GF_CALLOC(1, sizeof(struct cs_remote_stores), + gf_cs_mt_cs_remote_stores_t); + if (!priv->stores) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Could not " + "allocate memory for priv->stores"); + ret = -1; + goto out; + } + + (void)dlerror(); /* clear out previous error string */ + + /* load library methods */ + store_methods = (store_methods_t *)dlsym(handle, "store_ops"); + if (!store_methods) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null store_methods %s", + dlerror()); + ret = -1; + goto out; + } + + (void)dlerror(); + + if (priv->remote_read) { + priv->stores->rdfop = store_methods->fop_remote_read; + if (!priv->stores->rdfop) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "failed to get" + " read fop %s", + dlerror()); + ret = -1; + goto out; + } + } + + priv->stores->dlfop = store_methods->fop_download; + if (!priv->stores->dlfop) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "failed to get" + " download fop %s", + dlerror()); + ret = -1; + goto out; + } + + (void)dlerror(); + priv->stores->init = store_methods->fop_init; + if (!priv->stores->init) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "failed to get" + 
" init fop %s", + dlerror()); + ret = -1; + goto out; + } + + (void)dlerror(); + priv->stores->reconfigure = store_methods->fop_reconfigure; + if (!priv->stores->reconfigure) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "failed to get" + " reconfigure fop %s", + dlerror()); + ret = -1; + goto out; + } + + priv->stores->handle = handle; + + priv->stores->config = (void *)((priv->stores->init)(this)); + if (!priv->stores->config) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null config"); + ret = -1; + goto out; + } + } + + ret = 0; + +out: + if (ret == -1) { + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + cs_cleanup_private(priv); + + if (handle) { + dlclose(handle); + } + } + + GF_FREE(libpath); + + return ret; +} + +int +cs_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_int = 0; + cs_inode_ctx_t *ctx = NULL; + + inode_ctx_del(inode, this, &ctx_int); + if (!ctx_int) + return 0; + + ctx = (cs_inode_ctx_t *)(uintptr_t)ctx_int; + + GF_FREE(ctx); + return 0; +} + +void +cs_fini(xlator_t *this) +{ + cs_private_t *priv = NULL; + priv = this->private; + + cs_cleanup_private(priv); +} + +int +cs_reconfigure(xlator_t *this, dict_t *options) +{ + cs_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + if (!priv) { + ret = -1; + goto out; + } + + GF_OPTION_RECONF("cloudsync-remote-read", priv->remote_read, options, bool, + out); + + /* needed only for per volume configuration*/ + ret = priv->stores->reconfigure(this, options); + +out: + return ret; +} + +int32_t +cs_mem_acct_init(xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("cloudsync", this, out); + + ret = xlator_mem_acct_init(this, gf_cs_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Memory accounting init failed"); + return ret; + } +out: + return ret; +} + +int32_t +cs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + int ret = 0; + int op_errno = ENOMEM; 
+ + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "failed to create " + "dict"); + goto err; + } + } + + ret = dict_set_uint32(xdata, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int32_t +cs_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + cs_local_t *local = NULL; + int ret = 0; + uint64_t val = 0; + + local = frame->local; + + local->call_cnt++; + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "truncate failed"); + ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val); + if (ret == 0) { + if (val == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "could not get file state, unwinding"); + op_ret = -1; + op_errno = EIO; + goto unwind; + } else { + __cs_inode_ctx_update(this, local->loc.inode, val); + gf_msg(this->name, GF_LOG_INFO, 0, 0, " state = %" PRIu64, val); + + if (local->call_cnt == 1 && + (val == GF_CS_REMOTE || val == GF_CS_DOWNLOADING)) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "will repair and download " + "the file, current state : %" PRIu64, + val); + goto repair; + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "second truncate, Unwinding"); + goto unwind; + } + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state " + "could not be figured, unwinding"); + goto unwind; + } + } else { + /* successful write => file is local */ + __cs_inode_ctx_update(this, local->loc.inode, GF_CS_LOCAL); + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "state : GF_CS_LOCAL" + ", truncate successful"); + + goto unwind; + } + 
+repair: + ret = locate_and_execute(frame); + if (ret) { + goto unwind; + } + + return 0; + +unwind: + CS_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +cs_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + cs_local_t *local = NULL; + int ret = 0; + cs_inode_ctx_t *ctx = NULL; + gf_cs_obj_state state = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + local = cs_local_init(this, frame, loc, NULL, GF_FOP_TRUNCATE); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed"); + goto err; + } + + __cs_inode_ctx_get(this, loc->inode, &ctx); + + if (ctx) + state = __cs_get_file_state(loc->inode, ctx); + else + state = GF_CS_LOCAL; + + local->xattr_req = xdata ? dict_ref(xdata) : (xdata = dict_new()); + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + + local->stub = fop_truncate_stub(frame, cs_resume_truncate, loc, offset, + xdata); + if (!local->stub) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + goto err; + } + + if (state == GF_CS_LOCAL) { + STACK_WIND(frame, cs_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + + } else { + local->call_cnt++; + ret = locate_and_execute(frame); + if (ret) { + goto err; + } + } + + return 0; +err: + CS_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int32_t +cs_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct statvfs *buf, dict_t *xdata) +{ + STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int32_t +cs_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND(frame, cs_statfs_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +} + +int32_t +cs_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +cs_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xattr_req) +{ + STACK_WIND(frame, cs_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xattr_req); + return 0; +} + +int32_t +cs_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + cs_local_t *local = NULL; + + local = frame->local; + + if (local->locked) + cs_inodelk_unlock(frame); + + CS_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +cs_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + data_t *tmp = NULL; + cs_local_t *local = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + + local = cs_local_init(this, frame, loc, NULL, GF_FOP_SETXATTR); + if (!local) { + ret = -1; + goto err; + } + + local->xattr_req = xdata ? 
dict_ref(xdata) : (xdata = dict_new()); + + tmp = dict_get_sizen(dict, GF_CS_OBJECT_UPLOAD_COMPLETE); + if (tmp) { + /* Value of key should be the atime */ + local->stub = fop_setxattr_stub(frame, cs_resume_setxattr, loc, dict, + flags, xdata); + + if (!local->stub) + goto err; + + ret = locate_and_execute(frame); + if (ret) { + goto err; + } + + return 0; + } + + STACK_WIND(frame, cs_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; +err: + CS_STACK_UNWIND(setxattr, frame, -1, errno, NULL); + return 0; +} + +int32_t +cs_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +cs_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + STACK_WIND(frame, cs_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +} + +int32_t +cs_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +cs_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + STACK_WIND(frame, cs_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +} + +int32_t +cs_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int32_t +cs_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xattr_req) +{ + cs_local_t *local = NULL; + int ret = 0; + + local = cs_local_init(this, frame, loc, 
NULL, GF_FOP_UNLINK); + if (!local) + goto err; + + local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + STACK_WIND(frame, cs_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, flags, local->xattr_req); + return 0; +err: + CS_STACK_UNWIND(unlink, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +cs_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + int ret = 0; + uint64_t val = 0; + + if (op_ret == 0) { + ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val); + if (!ret) { + ret = __cs_inode_ctx_update(this, fd->inode, val); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed"); + } + } + } else { + cs_inode_ctx_reset(this, fd->inode); + } + + CS_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int32_t +cs_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xattr_req) +{ + cs_local_t *local = NULL; + int ret = 0; + + local = cs_local_init(this, frame, NULL, fd, GF_FOP_OPEN); + if (!local) + goto err; + + local->xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + + STACK_WIND(frame, cs_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, local->xattr_req); + return 0; +err: + CS_STACK_UNWIND(open, frame, -1, errno, NULL, NULL); + return 0; +} + +int32_t +cs_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + int ret = 0; + uint64_t val = 0; + fd_t *fd = NULL; + cs_local_t *local = NULL; + + local = frame->local; + + fd = local->fd; + + if (op_ret == 0) { + ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val); + if (!ret) { + gf_msg_debug(this->name, 0, "state %" PRIu64, val); + ret = __cs_inode_ctx_update(this, fd->inode, val); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed"); + } + } + } else { + cs_inode_ctx_reset(this, fd->inode); + } + + CS_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + +int32_t +cs_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr_req) +{ + cs_local_t *local = NULL; + int ret = 0; + + local = cs_local_init(this, frame, NULL, fd, GF_FOP_FSTAT); + if (!local) + goto err; + + if (fd->inode->ia_type == IA_IFDIR) + goto wind; + + local->xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + +wind: + STACK_WIND(frame, cs_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); + return 0; +err: + CS_STACK_UNWIND(fstat, frame, -1, errno, NULL, NULL); + return 0; +} + +cs_local_t * +cs_local_init(xlator_t *this, call_frame_t *frame, loc_t *loc, fd_t *fd, + glusterfs_fop_t fop) +{ + cs_local_t *local = NULL; + int ret = 0; + + local = mem_get0(this->local_pool); + if (!local) + goto out; + + if (loc) { + ret = loc_copy(&local->loc, loc); + if (ret) + goto out; + } + + if (fd) { + local->fd = fd_ref(fd); + } + + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; + local->dloffset = 0; + frame->local = local; + local->locked = _gf_false; + local->call_cnt = 0; +out: + if (ret) { + if (local) + mem_put(local); + local = NULL; + } + + return local; +} + +call_frame_t * +cs_lock_frame(call_frame_t *parent_frame) +{ + call_frame_t *lock_frame = NULL; + + lock_frame = copy_frame(parent_frame); + + if (lock_frame == NULL) + goto out; + + set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root); + +out: + return lock_frame; +} + +void +cs_lock_wipe(call_frame_t *lock_frame) +{ + CS_STACK_DESTROY(lock_frame); +} + +int32_t +cs_inodelk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + cs_lock_wipe(frame); + + return 0; +} + +int +cs_inodelk_unlock(call_frame_t *main_frame) +{ + xlator_t *this = NULL; + struct gf_flock flock = { + 0, + }; + call_frame_t *lock_frame = NULL; + cs_local_t *lock_local = NULL; + cs_local_t *main_local = NULL; + int ret = 0; + + this = main_frame->this; + main_local = main_frame->local; + + lock_frame = cs_lock_frame(main_frame); + if (!lock_frame) + goto out; + + lock_local = 
cs_local_init(this, lock_frame, NULL, NULL, 0); + if (!lock_local) + goto out; + + ret = cs_build_loc(&lock_local->loc, main_frame); + if (ret) { + goto out; + } + + flock.l_type = F_UNLCK; + + main_local->locked = _gf_false; + + STACK_WIND(lock_frame, cs_inodelk_unlock_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, CS_LOCK_DOMAIN, + &lock_local->loc, F_SETLKW, &flock, NULL); + + return 0; + +out: + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Stale lock would be found on" + " server"); + + if (lock_frame) + cs_lock_wipe(lock_frame); + + return 0; +} + +int +cs_download_task(void *arg) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + cs_private_t *priv = NULL; + int ret = -1; + char *sign_req = NULL; + fd_t *fd = NULL; + cs_local_t *local = NULL; + dict_t *dict = NULL; + + frame = (call_frame_t *)arg; + + this = frame->this; + + priv = this->private; + + if (!priv->stores) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "No remote store " + "plugins found"); + ret = -1; + goto out; + } + + local = frame->local; + + if (local->fd) + fd = fd_anonymous(local->fd->inode); + else + fd = fd_anonymous(local->loc.inode); + + if (!fd) { + gf_msg("CS", GF_LOG_ERROR, 0, 0, "fd creation failed"); + ret = -1; + goto out; + } + + local->dlfd = fd; + local->dloffset = 0; + + dict = dict_new(); + if (!dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "failed to create " + "dict"); + ret = -1; + goto out; + } + + ret = dict_set_uint32(dict, GF_CS_OBJECT_DOWNLOADING, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "dict_set failed"); + ret = -1; + goto out; + } + + ret = syncop_fsetxattr(this, local->fd, dict, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "fsetxattr failed " + "key %s", + GF_CS_OBJECT_DOWNLOADING); + ret = -1; + goto out; + } + /*this calling method is for per volume setting */ + ret = priv->stores->dlfop(frame, priv->stores->config); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "download failed" 
+ ", remotepath: %s", + local->remotepath); + + /*using dlfd as it is anonymous and have RDWR flag*/ + ret = syncop_ftruncate(FIRST_CHILD(this), local->dlfd, 0, NULL, NULL, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, -ret, "ftruncate failed"); + } else { + gf_msg_debug(this->name, 0, "ftruncate succeed"); + } + + ret = -1; + goto out; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "download success, path" + " : %s", + local->remotepath); + + ret = syncop_fremovexattr(this, local->fd, GF_CS_OBJECT_REMOTE, NULL, + NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, -ret, + "removexattr failed, remotexattr"); + ret = -1; + goto out; + } else { + gf_msg_debug(this->name, 0, + "fremovexattr success, " + "path : %s", + local->remotepath); + } + + ret = syncop_fremovexattr(this, local->fd, GF_CS_OBJECT_DOWNLOADING, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, -ret, + "removexattr failed, downloading xattr, path %s", + local->remotepath); + ret = -1; + goto out; + } else { + gf_msg_debug(this->name, 0, + "fremovexattr success" + " path %s", + local->remotepath); + } + } + +out: + GF_FREE(sign_req); + + if (dict) + dict_unref(dict); + + if (fd) { + fd_unref(fd); + local->dlfd = NULL; + } + + return ret; +} + +int +cs_download(call_frame_t *frame) +{ + int ret = 0; + cs_local_t *local = NULL; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + if (!local->remotepath) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "remote path not" + " available. Check posix logs to resolve"); + goto out; + } + + ret = cs_download_task((void *)frame); +out: + return ret; +} + +int +cs_set_xattr_req(call_frame_t *frame) +{ + cs_local_t *local = NULL; + GF_UNUSED int ret = 0; + + local = frame->local; + + /* When remote reads are performed (i.e. reads on remote store), + * there needs to be a way to associate a file on gluster volume + * with its correspnding file on the remote store. 
In order to do + * that, a unique key can be maintained as an xattr + * (GF_CS_XATTR_ARCHIVE_UUID)on the stub file on gluster bricks. + * This xattr should be provided to the plugin to + * perform the read fop on the correct file. This assumes that the file + * hierarchy and name need not be the same on remote store as that of + * the gluster volume. + */ + ret = dict_set_sizen_str_sizen(local->xattr_req, GF_CS_XATTR_ARCHIVE_UUID, + "1"); + + return 0; +} + +int +cs_update_xattrs(call_frame_t *frame, dict_t *xdata) +{ + cs_local_t *local = NULL; + xlator_t *this = NULL; + int size = -1; + GF_UNUSED int ret = 0; + + local = frame->local; + this = frame->this; + + local->xattrinfo.lxattr = GF_CALLOC(1, sizeof(cs_loc_xattr_t), + gf_cs_mt_cs_lxattr_t); + if (!local->xattrinfo.lxattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + gf_uuid_copy(local->xattrinfo.lxattr->gfid, local->loc.gfid); + + if (local->remotepath) { + local->xattrinfo.lxattr->file_path = gf_strdup(local->remotepath); + if (!local->xattrinfo.lxattr->file_path) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + } + + ret = dict_get_gfuuid(xdata, GF_CS_XATTR_ARCHIVE_UUID, + &(local->xattrinfo.lxattr->uuid)); + + if (ret) { + gf_uuid_clear(local->xattrinfo.lxattr->uuid); + } + size = strlen(this->name) - strlen("-cloudsync") + 1; + local->xattrinfo.lxattr->volname = GF_CALLOC(1, size, gf_common_mt_char); + if (!local->xattrinfo.lxattr->volname) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + strncpy(local->xattrinfo.lxattr->volname, this->name, size - 1); + local->xattrinfo.lxattr->volname[size - 1] = '\0'; + + return 0; +err: + cs_xattrinfo_wipe(local); + return -1; +} + +int +cs_serve_readv(call_frame_t *frame, off_t offset, size_t size, uint32_t flags) +{ + xlator_t *this = NULL; + cs_private_t *priv = NULL; + int ret = -1; + fd_t *fd = NULL; + cs_local_t *local = NULL; + + local = frame->local; + this = frame->this; + priv = 
this->private; + + if (!local->remotepath) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "remote path not" + " available. Check posix logs to resolve"); + goto out; + } + + if (!priv->stores) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "No remote store " + "plugins found"); + ret = -1; + goto out; + } + + if (local->fd) { + fd = fd_anonymous(local->fd->inode); + } else { + fd = fd_anonymous(local->loc.inode); + } + + local->xattrinfo.size = size; + local->xattrinfo.offset = offset; + local->xattrinfo.flags = flags; + + if (!fd) { + gf_msg("CS", GF_LOG_ERROR, 0, 0, "fd creation failed"); + ret = -1; + goto out; + } + + local->dlfd = fd; + local->dloffset = offset; + + /*this calling method is for per volume setting */ + ret = priv->stores->rdfop(frame, priv->stores->config); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "read failed" + ", remotepath: %s", + local->remotepath); + ret = -1; + goto out; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "read success, path" + " : %s", + local->remotepath); + } + +out: + if (fd) { + fd_unref(fd); + local->dlfd = NULL; + } + return ret; +} + +int32_t +cs_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + cs_local_t *local = NULL; + int ret = 0; + uint64_t val = 0; + fd_t *fd = NULL; + + local = frame->local; + fd = local->fd; + + local->call_cnt++; + + if (op_ret == -1) { + ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val); + if (ret == 0) { + if (val == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "could not get file state, unwinding"); + op_ret = -1; + op_errno = EIO; + goto unwind; + } else { + __cs_inode_ctx_update(this, fd->inode, val); + gf_msg(this->name, GF_LOG_INFO, 0, 0, " state = %" PRIu64, val); + + if (local->call_cnt == 1 && + (val == GF_CS_REMOTE || val == GF_CS_DOWNLOADING)) { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + 
" will read from remote : %" PRIu64, val); + goto repair; + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "second readv, Unwinding"); + goto unwind; + } + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state " + "could not be figured, unwinding"); + goto unwind; + } + } else { + /* successful readv => file is local */ + __cs_inode_ctx_update(this, fd->inode, GF_CS_LOCAL); + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "state : GF_CS_LOCAL" + ", readv successful"); + + goto unwind; + } + +repair: + ret = locate_and_execute(frame); + if (ret) { + goto unwind; + } + + return 0; + +unwind: + CS_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +int32_t +cs_resume_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = 0; + + ret = cs_resume_postprocess(this, frame, fd->inode); + if (ret) { + goto unwind; + } + + cs_inodelk_unlock(frame); + + STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + + return 0; + +unwind: + cs_inodelk_unlock(frame); + + cs_common_cbk(frame); + + return 0; +} + +int32_t +cs_resume_remote_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = 0; + cs_local_t *local = NULL; + gf_cs_obj_state state = -1; + cs_inode_ctx_t *ctx = NULL; + + cs_inodelk_unlock(frame); + + local = frame->local; + if (!local) { + ret = -1; + goto unwind; + } + + __cs_inode_ctx_get(this, fd->inode, &ctx); + + state = __cs_get_file_state(fd->inode, ctx); + if (state == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "status is GF_CS_ERROR." + " Aborting readv"); + local->op_ret = -1; + local->op_errno = EREMOTE; + ret = -1; + goto unwind; + } + + /* Serve readv from remote store only if it is remote. */ + gf_msg_debug(this->name, 0, "status of file %s is %d", + local->remotepath ? 
local->remotepath : "", state); + + /* We will reach this condition if local inode ctx had REMOTE + * state when the control was in cs_readv but after stat + * we got an updated state saying that the file is LOCAL. + */ + if (state == GF_CS_LOCAL) { + STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + } else if (state == GF_CS_REMOTE) { + ret = cs_resume_remote_readv_postprocess(this, frame, fd->inode, offset, + size, flags); + /* Failed to submit the remote readv fop to plugin */ + if (ret) { + local->op_ret = -1; + local->op_errno = EREMOTE; + goto unwind; + } + /* When the file is in any other intermediate state, + * we should not perform remote reads. + */ + } else { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + return 0; + +unwind: + cs_common_cbk(frame); + + return 0; +} + +int32_t +cs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int op_errno = ENOMEM; + cs_local_t *local = NULL; + int ret = 0; + cs_inode_ctx_t *ctx = NULL; + gf_cs_obj_state state = -1; + cs_private_t *priv = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + priv = this->private; + + local = cs_local_init(this, frame, NULL, fd, GF_FOP_READ); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed"); + goto err; + } + + __cs_inode_ctx_get(this, fd->inode, &ctx); + + if (ctx) + state = __cs_get_file_state(fd->inode, ctx); + else + state = GF_CS_LOCAL; + + local->xattr_req = xdata ? 
dict_ref(xdata) : (xdata = dict_new()); + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "dict_set failed key:" + " %s", + GF_CS_OBJECT_STATUS); + goto err; + } + + if (priv->remote_read) { + local->stub = fop_readv_stub(frame, cs_resume_remote_readv, fd, size, + offset, flags, xdata); + } else { + local->stub = fop_readv_stub(frame, cs_resume_readv, fd, size, offset, + flags, xdata); + } + if (!local->stub) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory"); + goto err; + } + + if (state == GF_CS_LOCAL) { + STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + } else { + local->call_cnt++; + ret = locate_and_execute(frame); + if (ret) { + goto err; + } + } + + return 0; + +err: + CS_STACK_UNWIND(readv, frame, -1, op_errno, NULL, -1, NULL, NULL, NULL); + + return 0; +} + +int +cs_resume_remote_readv_postprocess(xlator_t *this, call_frame_t *frame, + inode_t *inode, off_t offset, size_t size, + uint32_t flags) +{ + int ret = 0; + + ret = cs_serve_readv(frame, offset, size, flags); + + return ret; +} + +int +cs_stat_check_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + cs_local_t *local = NULL; + call_stub_t *stub = NULL; + char *filepath = NULL; + int ret = 0; + inode_t *inode = NULL; + uint64_t val = 0; + + local = frame->local; + + if (op_ret == -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg(this->name, GF_LOG_ERROR, 0, op_errno, "stat check failed"); + goto err; + } else { + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) { + local->op_ret = -1; + local->op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "null inode " + "returned"); + goto err; + } + + ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val); + if (ret == 0) { + if (val == GF_CS_ERROR) { 
+ cs_inode_ctx_reset(this, inode); + local->op_ret = -1; + local->op_errno = EIO; + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "status = GF_CS_ERROR. failed to get " + " file state"); + goto err; + } else { + ret = __cs_inode_ctx_update(this, inode, val); + gf_msg_debug(this->name, 0, "status : %" PRIu64, val); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed"); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + } + } else { + gf_msg_debug(this->name, 0, "status not found in dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + ret = dict_get_str_sizen(xdata, GF_CS_OBJECT_REMOTE, &filepath); + if (filepath) { + gf_msg_debug(this->name, 0, "filepath returned %s", filepath); + local->remotepath = gf_strdup(filepath); + if (!local->remotepath) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + } else { + gf_msg_debug(this->name, 0, "NULL filepath"); + } + + ret = cs_update_xattrs(frame, xdata); + if (ret) + goto err; + + local->op_ret = 0; + local->xattr_rsp = dict_ref(xdata); + memcpy(&local->stbuf, stbuf, sizeof(struct iatt)); + } + + stub = local->stub; + local->stub = NULL; + call_resume(stub); + + return 0; +err: + cs_inodelk_unlock(frame); + + cs_common_cbk(frame); + + return 0; +} + +int +cs_do_stat_check(call_frame_t *main_frame) +{ + cs_local_t *local = NULL; + xlator_t *this = NULL; + int ret = 0; + + local = main_frame->local; + this = main_frame->this; + + ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_REPAIR, 256); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "dict_set failed"); + goto err; + } + + cs_set_xattr_req(main_frame); + + if (local->fd) { + STACK_WIND(main_frame, cs_stat_check_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, local->fd, local->xattr_req); + } else { + STACK_WIND(main_frame, cs_stat_check_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, &local->loc, + local->xattr_req); + } + + return 0; + +err: + 
cs_inodelk_unlock(main_frame); + + cs_common_cbk(main_frame); + + return 0; +} + +void +cs_common_cbk(call_frame_t *frame) +{ + glusterfs_fop_t fop = -1; + cs_local_t *local = NULL; + + local = frame->local; + + fop = local->fop; + + /*Note: Only the failure case needs to be handled here. Since for + * successful stat check the fop will resume anyway. The unwind can + * happen from the fop_cbk and each cbk can unlock the inodelk in case + * a lock was taken before. The lock status can be stored in frame */ + + /* for failure case */ + + /*TODO: add other fops*/ + switch (fop) { + case GF_FOP_WRITE: + CS_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, NULL, + NULL, NULL); + break; + + case GF_FOP_SETXATTR: + CS_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + NULL); + break; + case GF_FOP_READ: + CS_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, NULL, + 0, NULL, NULL, NULL); + break; + case GF_FOP_FTRUNCATE: + CS_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, + NULL, NULL, NULL); + break; + + case GF_FOP_TRUNCATE: + CS_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, + NULL, NULL, NULL); + break; + default: + break; + } + + return; +} + +int +cs_blocking_inodelk_cbk(call_frame_t *lock_frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + cs_local_t *main_local = NULL; + call_frame_t *main_frame = NULL; + cs_local_t *lock_local = NULL; + + lock_local = lock_frame->local; + + main_frame = lock_local->main_frame; + main_local = main_frame->local; + + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "inodelk failed"); + main_local->op_errno = op_errno; + main_local->op_ret = op_ret; + goto err; + } + + main_local->locked = _gf_true; + + cs_lock_wipe(lock_frame); + + cs_do_stat_check(main_frame); + + return 0; +err: + cs_common_cbk(main_frame); + + cs_lock_wipe(lock_frame); + + return 0; +} + +int +cs_build_loc(loc_t *loc, call_frame_t *frame) +{ + cs_local_t 
*local = NULL; + int ret = -1; + + local = frame->local; + + if (local->fd) { + loc->inode = inode_ref(local->fd->inode); + if (loc->inode) { + gf_uuid_copy(loc->gfid, loc->inode->gfid); + ret = 0; + goto out; + } else { + ret = -1; + goto out; + } + } else { + loc->inode = inode_ref(local->loc.inode); + if (loc->inode) { + gf_uuid_copy(loc->gfid, loc->inode->gfid); + ret = 0; + goto out; + } else { + ret = -1; + goto out; + } + } +out: + return ret; +} + +int +cs_blocking_inodelk(call_frame_t *parent_frame) +{ + call_frame_t *lock_frame = NULL; + cs_local_t *lock_local = NULL; + xlator_t *this = NULL; + struct gf_flock flock = { + 0, + }; + int ret = 0; + + this = parent_frame->this; + + lock_frame = cs_lock_frame(parent_frame); + if (!lock_frame) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insuffcient memory"); + goto err; + } + + lock_local = cs_local_init(this, lock_frame, NULL, NULL, 0); + if (!lock_local) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed"); + goto err; + } + + lock_local->main_frame = parent_frame; + + flock.l_type = F_WRLCK; + + ret = cs_build_loc(&lock_local->loc, parent_frame); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "build_loc failed"); + goto err; + } + + STACK_WIND(lock_frame, cs_blocking_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, CS_LOCK_DOMAIN, + &lock_local->loc, F_SETLKW, &flock, NULL); + + return 0; +err: + if (lock_frame) + cs_lock_wipe(lock_frame); + + return -1; +} + +int +locate_and_execute(call_frame_t *frame) +{ + int ret = 0; + + ret = cs_blocking_inodelk(frame); + + if (ret) + return -1; + else + return 0; +} + +int32_t +cs_resume_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xattr_req) +{ + cs_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + ret = cs_resume_postprocess(this, frame, loc->inode); + if (ret) { + goto unwind; + } + + cs_inodelk_unlock(frame); + + STACK_WIND(frame, cs_truncate_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->truncate, loc, offset, + local->xattr_req); + + return 0; + +unwind: + cs_inodelk_unlock(frame); + + cs_common_cbk(frame); + + return 0; +} + +int32_t +cs_resume_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + cs_local_t *local = NULL; + cs_inode_ctx_t *ctx = NULL; + gf_cs_obj_state state = GF_CS_ERROR; + + local = frame->local; + + __cs_inode_ctx_get(this, loc->inode, &ctx); + + state = __cs_get_file_state(loc->inode, ctx); + + if (state == GF_CS_ERROR) { + /* file is already remote */ + local->op_ret = -1; + local->op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "file %s , could not figure file state", loc->path); + goto unwind; + } + + if (state == GF_CS_REMOTE) { + /* file is already remote */ + local->op_ret = -1; + local->op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, 0, EINVAL, + "file %s is already remote", loc->path); + goto unwind; + } + + if (state == GF_CS_DOWNLOADING) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + " file is in downloading state."); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + STACK_WIND(frame, cs_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + local->xattr_req); + + return 0; +unwind: + cs_inodelk_unlock(frame); + + cs_common_cbk(frame); + + return 0; +} + +gf_cs_obj_state +__cs_get_file_state(inode_t *inode, cs_inode_ctx_t *ctx) +{ + gf_cs_obj_state state = -1; + + if (!ctx) + return GF_CS_ERROR; + + LOCK(&inode->lock); + { + state = ctx->state; + } + UNLOCK(&inode->lock); + + return state; +} + +void +__cs_inode_ctx_get(xlator_t *this, inode_t *inode, cs_inode_ctx_t **ctx) +{ + uint64_t ctxint = 0; + int ret = 0; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctxint); + } + UNLOCK(&inode->lock); + + if (ret) + *ctx = NULL; + else + *ctx = (cs_inode_ctx_t *)(uintptr_t)ctxint; + + return; +} + +int +__cs_inode_ctx_update(xlator_t *this, 
inode_t *inode, uint64_t val) +{ + cs_inode_ctx_t *ctx = NULL; + uint64_t ctxint = 0; + int ret = 0; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctxint); + if (ret) { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_cs_mt_cs_inode_ctx_t); + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx allocation failed"); + ret = -1; + goto out; + } + + ctx->state = val; + + ctxint = (uint64_t)(uintptr_t)ctx; + + ret = __inode_ctx_set(inode, this, &ctxint); + if (ret) { + GF_FREE(ctx); + goto out; + } + } else { + ctx = (cs_inode_ctx_t *)(uintptr_t)ctxint; + + ctx->state = val; + } + } + +out: + UNLOCK(&inode->lock); + + return ret; +} + +int +cs_inode_ctx_reset(xlator_t *this, inode_t *inode) +{ + cs_inode_ctx_t *ctx = NULL; + uint64_t ctxint = 0; + + inode_ctx_del(inode, this, &ctxint); + if (!ctxint) { + return 0; + } + + ctx = (cs_inode_ctx_t *)(uintptr_t)ctxint; + + GF_FREE(ctx); + return 0; +} + +int +cs_resume_postprocess(xlator_t *this, call_frame_t *frame, inode_t *inode) +{ + cs_local_t *local = NULL; + gf_cs_obj_state state = -1; + cs_inode_ctx_t *ctx = NULL; + int ret = 0; + + local = frame->local; + if (!local) { + ret = -1; + goto out; + } + + __cs_inode_ctx_get(this, inode, &ctx); + + state = __cs_get_file_state(inode, ctx); + if (state == GF_CS_ERROR) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "status is GF_CS_ERROR." 
+ " Aborting write"); + local->op_ret = -1; + local->op_errno = EREMOTE; + ret = -1; + goto out; + } + + if (state == GF_CS_REMOTE || state == GF_CS_DOWNLOADING) { + gf_msg_debug(this->name, 0, "status is %d", state); + ret = cs_download(frame); + if (ret == 0) { + gf_msg_debug(this->name, 0, "Winding for Final Write"); + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + " download failed, unwinding writev"); + local->op_ret = -1; + local->op_errno = EREMOTE; + ret = -1; + } + } +out: + return ret; +} + +int32_t +cs_fdctx_to_dict(xlator_t *this, fd_t *fd, dict_t *dict) +{ + return 0; +} + +int32_t +cs_inode(xlator_t *this) +{ + return 0; +} + +int32_t +cs_inode_to_dict(xlator_t *this, dict_t *dict) +{ + return 0; +} + +int32_t +cs_history(xlator_t *this) +{ + return 0; +} + +int32_t +cs_fd(xlator_t *this) +{ + return 0; +} + +int32_t +cs_fd_to_dict(xlator_t *this, dict_t *dict) +{ + return 0; +} + +int32_t +cs_fdctx(xlator_t *this, fd_t *fd) +{ + return 0; +} + +int32_t +cs_inodectx(xlator_t *this, inode_t *ino) +{ + return 0; +} + +int32_t +cs_inodectx_to_dict(xlator_t *this, inode_t *ino, dict_t *dict) +{ + return 0; +} + +int32_t +cs_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname) +{ + return 0; +} + +int32_t +cs_priv(xlator_t *this) +{ + return 0; +} + +int +cs_notify(xlator_t *this, int event, void *data, ...) 
+{ + return default_notify(this, event, data); +} + +struct xlator_fops cs_fops = { + .stat = cs_stat, + .readdirp = cs_readdirp, + .truncate = cs_truncate, + .seek = cs_seek, + .statfs = cs_statfs, + .fallocate = cs_fallocate, + .discard = cs_discard, + .getxattr = cs_getxattr, + .writev = cs_writev, + .setxattr = cs_setxattr, + .fgetxattr = cs_fgetxattr, + .lookup = cs_lookup, + .fsetxattr = cs_fsetxattr, + .readv = cs_readv, + .ftruncate = cs_ftruncate, + .rchecksum = cs_rchecksum, + .unlink = cs_unlink, + .open = cs_open, + .fstat = cs_fstat, + .zerofill = cs_zerofill, +}; + +struct xlator_cbks cs_cbks = { + .forget = cs_forget, +}; + +struct xlator_dumpops cs_dumpops = { + .fdctx_to_dict = cs_fdctx_to_dict, + .inode = cs_inode, + .inode_to_dict = cs_inode_to_dict, + .history = cs_history, + .fd = cs_fd, + .fd_to_dict = cs_fd_to_dict, + .fdctx = cs_fdctx, + .inodectx = cs_inodectx, + .inodectx_to_dict = cs_inodectx_to_dict, + .priv_to_dict = cs_priv_to_dict, + .priv = cs_priv, +}; + +struct volume_options cs_options[] = { + {.key = {"cloudsync-storetype"}, + .type = GF_OPTION_TYPE_STR, + .description = "Defines which remote store is enabled"}, + {.key = {"cloudsync-remote-read"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Defines a remote read fop when on"}, + {.key = {"cloudsync-store-id"}, + .type = GF_OPTION_TYPE_STR, + .description = "Defines a volume wide store id"}, + {.key = {"cloudsync-product-id"}, + .type = GF_OPTION_TYPE_STR, + .description = "Defines a volume wide product id"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = cs_init, + .fini = cs_fini, + .notify = cs_notify, + .reconfigure = cs_reconfigure, + .mem_acct_init = cs_mem_acct_init, + .dumpops = &cs_dumpops, + .fops = &cs_fops, + .cbks = &cs_cbks, + .options = cs_options, + .identifier = "cloudsync", + .category = GF_TECH_PREVIEW, +}; diff --git a/xlators/features/cloudsync/src/cloudsync.h b/xlators/features/cloudsync/src/cloudsync.h new file mode 100644 index 
00000000000..d24141978d6 --- /dev/null +++ b/xlators/features/cloudsync/src/cloudsync.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __CLOUDSYNC_H__ +#define __CLOUDSYNC_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> +#include <glusterfs/call-stub.h> +#include "cloudsync-common.h" +#include "cloudsync-autogen-fops.h" + +#define ALIGN_SIZE 4096 +#define CS_LOCK_DOMAIN "cs.protect.file.stat" +typedef struct cs_dlstore { + off_t off; + struct iovec *vector; + int32_t count; + struct iobref *iobref; + uint32_t flags; +} cs_dlstore; + +typedef struct cs_inode_ctx { + cs_loc_xattr_t locxattr; + gf_cs_obj_state state; +} cs_inode_ctx_t; + +struct cs_plugin { + char *name; /* store name */ + char *library; /* library to load for the given store */ + char *description; /* description about the store */ +}; + +cs_local_t * +cs_local_init(xlator_t *this, call_frame_t *frame, loc_t *loc, fd_t *fd, + glusterfs_fop_t fop); + +int +locate_and_execute(call_frame_t *frame); + +int32_t +cs_resume_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +cs_inodelk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +size_t +cs_write_callback(void *lcurlbuf, size_t size, size_t nitems, void *frame); + +void +cs_common_cbk(call_frame_t *frame); + +gf_boolean_t +cs_is_file_remote(struct iatt *stbuf, dict_t *xattr); + +int32_t +cs_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t 
op_errno, dict_t *xdata); +int +cs_build_loc(loc_t *loc, call_frame_t *frame); + +int +cs_blocking_inodelk_cbk(call_frame_t *lock_frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +cs_read_authinfo(xlator_t *this); + +int +__cs_inode_ctx_update(xlator_t *this, inode_t *inode, uint64_t val); + +int +cs_inode_ctx_reset(xlator_t *this, inode_t *inode); + +void +__cs_inode_ctx_get(xlator_t *this, inode_t *inode, cs_inode_ctx_t **ctx); + +gf_cs_obj_state +__cs_get_file_state(inode_t *inode, cs_inode_ctx_t *ctx); + +int +cs_inodelk_unlock(call_frame_t *main_frame); + +int +cs_resume_postprocess(xlator_t *this, call_frame_t *frame, inode_t *inode); + +int32_t +cs_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); +int32_t +cs_resume_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xattr_req); + +int32_t +cs_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata); +int32_t +cs_resume_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); +int32_t +cs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int +cs_resume_remote_readv_postprocess(xlator_t *this, call_frame_t *frame, + inode_t *inode, off_t offset, size_t size, + uint32_t flags); +int +cs_serve_readv(call_frame_t *frame, off_t offset, size_t size, uint32_t flags); +#endif /* __CLOUDSYNC_H__ */ diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am index 4a64b52a9a1..98271a9f3fc 100644 --- a/xlators/features/compress/src/Makefile.am +++ b/xlators/features/compress/src/Makefile.am @@ -4,13 +4,15 @@ xlatordir = 
$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = cdc.h cdc-mem-types.h -cdc_la_LDFLAGS = -module -avoidversion $(LIBZ_LIBS) +cdc_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) cdc_la_SOURCES = cdc.c cdc-helper.c -cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(ZLIB_LIBS) -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ --shared -nostartfiles $(LIBZ_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ + $(LIBZ_CFLAGS) AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c index 54432ff455c..f973ff56cf5 100644 --- a/xlators/features/compress/src/cdc-helper.c +++ b/xlators/features/compress/src/cdc-helper.c @@ -8,13 +8,9 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/syscall.h> #include "cdc.h" #include "cdc-mem-types.h" @@ -38,118 +34,110 @@ * gzip_header is added only during debugging. 
* Refer to the function cdc_dump_iovec_to_disk */ -static const char gzip_header[10] = - { - '\037', '\213', Z_DEFLATED, 0, - 0, 0, 0, 0, - 0, GF_CDC_OS_ID - }; +static const char gzip_header[10] = {'\037', '\213', Z_DEFLATED, 0, 0, 0, 0, + 0, 0, GF_CDC_OS_ID}; static int32_t -cdc_next_iovec (xlator_t *this, cdc_info_t *ci) +cdc_next_iovec(xlator_t *this, cdc_info_t *ci) { - int ret = -1; - - ci->ncount++; - /* check for iovec overflow -- should not happen */ - if (ci->ncount == MAX_IOVEC) { - gf_log (this->name, GF_LOG_ERROR, - "Zlib output buffer overflow" - " ->ncount (%d) | ->MAX_IOVEC (%d)", - ci->ncount, MAX_IOVEC); - goto out; - } - - ret = 0; - - out: - return ret; + int ret = -1; + + ci->ncount++; + /* check for iovec overflow -- should not happen */ + if (ci->ncount == MAX_IOVEC) { + gf_log(this->name, GF_LOG_ERROR, + "Zlib output buffer overflow" + " ->ncount (%d) | ->MAX_IOVEC (%d)", + ci->ncount, MAX_IOVEC); + goto out; + } + + ret = 0; + +out: + return ret; } static void -cdc_put_long (unsigned char *string, unsigned long x) +cdc_put_long(unsigned char *string, unsigned long x) { - string[0] = (unsigned char) (x & 0xff); - string[1] = (unsigned char) ((x & 0xff00) >> 8); - string[2] = (unsigned char) ((x & 0xff0000) >> 16); - string[3] = (unsigned char) ((x & 0xff000000) >> 24); + string[0] = (unsigned char)(x & 0xff); + string[1] = (unsigned char)((x & 0xff00) >> 8); + string[2] = (unsigned char)((x & 0xff0000) >> 16); + string[3] = (unsigned char)((x & 0xff000000) >> 24); } static unsigned long -cdc_get_long (unsigned char *buf) +cdc_get_long(unsigned char *buf) { - return ((unsigned long) buf[0]) - | (((unsigned long) buf[1]) << 8) - | (((unsigned long) buf[2]) << 16) - | (((unsigned long) buf[3]) << 24); + return ((unsigned long)buf[0]) | (((unsigned long)buf[1]) << 8) | + (((unsigned long)buf[2]) << 16) | (((unsigned long)buf[3]) << 24); } static int32_t -cdc_init_gzip_trailer (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) 
+cdc_init_gzip_trailer(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) { - int ret = -1; - char *buf = NULL; + int ret = -1; + char *buf = NULL; - ret = cdc_next_iovec (this, ci); - if (ret) - goto out; + ret = cdc_next_iovec(this, ci); + if (ret) + goto out; - buf = CURR_VEC(ci).iov_base = - (char *) GF_CALLOC (1, GF_CDC_VALIDATION_SIZE, - gf_cdc_mt_gzip_trailer_t); + buf = CURR_VEC(ci).iov_base = (char *)GF_CALLOC(1, GF_CDC_VALIDATION_SIZE, + gf_cdc_mt_gzip_trailer_t); - if (!CURR_VEC(ci).iov_base) - goto out; + if (!CURR_VEC(ci).iov_base) + goto out; - CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE; + CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE; - cdc_put_long ((unsigned char *)&buf[0], ci->crc); - cdc_put_long ((unsigned char *)&buf[4], ci->stream.total_in); + cdc_put_long((unsigned char *)&buf[0], ci->crc); + cdc_put_long((unsigned char *)&buf[4], ci->stream.total_in); - ret = 0; + ret = 0; - out: - return ret; +out: + return ret; } static int32_t -cdc_alloc_iobuf_and_init_vec (xlator_t *this, - cdc_priv_t *priv, cdc_info_t *ci, - int size) +cdc_alloc_iobuf_and_init_vec(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, + int size) { - int ret = -1; - int alloc_len = 0; - struct iobuf *iobuf = NULL; + int ret = -1; + int alloc_len = 0; + struct iobuf *iobuf = NULL; - ret = cdc_next_iovec (this, ci); - if (ret) - goto out; + ret = cdc_next_iovec(this, ci); + if (ret) + goto out; - alloc_len = size ? size : ci->buffer_size; + alloc_len = size ? 
size : ci->buffer_size; - iobuf = iobuf_get2 (this->ctx->iobuf_pool, alloc_len); - if (!iobuf) - goto out; + iobuf = iobuf_get2(this->ctx->iobuf_pool, alloc_len); + if (!iobuf) + goto out; - ret = iobref_add (ci->iobref, iobuf); - if (ret) - goto out; + ret = iobref_add(ci->iobref, iobuf); + if (ret) + goto out; - /* Initialize this iovec */ - CURR_VEC(ci).iov_base = iobuf->ptr; - CURR_VEC(ci).iov_len = alloc_len; + /* Initialize this iovec */ + CURR_VEC(ci).iov_base = iobuf->ptr; + CURR_VEC(ci).iov_len = alloc_len; - ret = 0; + ret = 0; - out: - return ret; +out: + return ret; } static void -cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size) +cdc_init_zlib_output_stream(cdc_priv_t *priv, cdc_info_t *ci, int size) { - ci->stream.next_out = (unsigned char *) CURR_VEC(ci).iov_base; - ci->stream.avail_out = size ? size : ci->buffer_size; + ci->stream.next_out = (unsigned char *)CURR_VEC(ci).iov_base; + ci->stream.avail_out = size ? size : ci->buffer_size; } /* This routine is for testing and debugging only. @@ -157,391 +145,383 @@ cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size) * So each gzip dump file is at least 18 bytes in size. 
*/ void -cdc_dump_iovec_to_disk (xlator_t *this, cdc_info_t *ci, const char *file) +cdc_dump_iovec_to_disk(xlator_t *this, cdc_info_t *ci, const char *file) { - int i = 0; - int fd = 0; - size_t writen = 0; - size_t total_writen = 0; - - fd = open (file, O_WRONLY|O_CREAT|O_TRUNC, 0777 ); - if (fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot open file: %s", file); - return; - } - - writen = write (fd, (char *) gzip_header, 10); - total_writen += writen; - for (i = 0; i < ci->ncount; i++) { - writen = write (fd, (char *) ci->vec[i].iov_base, ci->vec[i].iov_len); - total_writen += writen; - } - - gf_log (this->name, GF_LOG_DEBUG, - "dump'd %zu bytes to %s", total_writen, GF_CDC_DEBUG_DUMP_FILE ); - - close (fd); + int i = 0; + int fd = 0; + size_t written = 0; + size_t total_written = 0; + + fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0777); + if (fd < 0) { + gf_log(this->name, GF_LOG_ERROR, "Cannot open file: %s", file); + return; + } + + written = sys_write(fd, (char *)gzip_header, 10); + total_written += written; + for (i = 0; i < ci->ncount; i++) { + written = sys_write(fd, (char *)ci->vec[i].iov_base, + ci->vec[i].iov_len); + total_written += written; + } + + gf_log(this->name, GF_LOG_DEBUG, "dump'd %zu bytes to %s", total_written, + GF_CDC_DEBUG_DUMP_FILE); + + sys_close(fd); } static int32_t -cdc_flush_libz_buffer (cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci, - int (*libz_func)(z_streamp, int), - int flush) +cdc_flush_libz_buffer(cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci, + int (*libz_func)(z_streamp, int), int flush) { - int32_t ret = Z_OK; - int done = 0; - unsigned int deflate_len = 0; + int32_t ret = Z_OK; + int done = 0; + unsigned int deflate_len = 0; - for (;;) { - deflate_len = ci->buffer_size - ci->stream.avail_out; + for (;;) { + deflate_len = ci->buffer_size - ci->stream.avail_out; - if (deflate_len != 0) { - CURR_VEC(ci).iov_len = deflate_len; + if (deflate_len != 0) { + CURR_VEC(ci).iov_len = deflate_len; - ret = 
cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); - if (ret) { - ret = Z_MEM_ERROR; - break; - } + ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0); + if (ret) { + ret = Z_MEM_ERROR; + break; + } - /* Re-position Zlib output buffer */ - cdc_init_zlib_output_stream (priv, ci, 0); - } + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream(priv, ci, 0); + } - if (done) { - ci->ncount--; - break; - } + if (done) { + ci->ncount--; + break; + } - ret = libz_func (&ci->stream, flush); + ret = libz_func(&ci->stream, flush); - if (ret == Z_BUF_ERROR) { - ret = Z_OK; - ci->ncount--; - break; - } + if (ret == Z_BUF_ERROR) { + ret = Z_OK; + ci->ncount--; + break; + } - done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END); + done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - } + if (ret != Z_OK && ret != Z_STREAM_END) + break; + } - return ret; + return ret; } static int32_t -do_cdc_compress (struct iovec *vec, xlator_t *this, cdc_priv_t *priv, - cdc_info_t *ci) +do_cdc_compress(struct iovec *vec, xlator_t *this, cdc_priv_t *priv, + cdc_info_t *ci) { - int ret = -1; - - /* Initialize defalte */ - ret = deflateInit2 (&ci->stream, priv->cdc_level, Z_DEFLATED, - priv->window_size, priv->mem_level, - Z_DEFAULT_STRATEGY); + int ret = -1; - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "unable to init Zlib (retval: %d)", ret); - goto out; - } - - ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); - if (ret) - goto out; + /* Initialize defalte */ + ret = deflateInit2(&ci->stream, priv->cdc_level, Z_DEFLATED, + priv->window_size, priv->mem_level, Z_DEFAULT_STRATEGY); - /* setup output buffer */ - cdc_init_zlib_output_stream (priv, ci, 0); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "unable to init Zlib (retval: %d)", + ret); + goto out; + } - /* setup input buffer */ - ci->stream.next_in = (unsigned char *) vec->iov_base; - ci->stream.avail_in = vec->iov_len; + ret = 
cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0); + if (ret) + goto out; - ci->crc = crc32 (ci->crc, (const Bytef *) vec->iov_base, vec->iov_len); + /* setup output buffer */ + cdc_init_zlib_output_stream(priv, ci, 0); - gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d", - ci->crc, ci->stream.avail_in, ci->buffer_size); + /* setup input buffer */ + ci->stream.next_in = (unsigned char *)vec->iov_base; + ci->stream.avail_in = vec->iov_len; - /* compress !! */ - while (ci->stream.avail_in != 0) { - if (ci->stream.avail_out == 0) { + ci->crc = crc32(ci->crc, (const Bytef *)vec->iov_base, vec->iov_len); - CURR_VEC(ci).iov_len = ci->buffer_size; + gf_log(this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d", ci->crc, + ci->stream.avail_in, ci->buffer_size); - ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); - if (ret) - break; + /* compress !! */ + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + CURR_VEC(ci).iov_len = ci->buffer_size; - /* Re-position Zlib output buffer */ - cdc_init_zlib_output_stream (priv, ci, 0); - } + ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0); + if (ret) + break; - ret = deflate (&ci->stream, Z_NO_FLUSH); - if (ret != Z_OK) - break; + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream(priv, ci, 0); } - out: - return ret; + ret = deflate(&ci->stream, Z_NO_FLUSH); + if (ret != Z_OK) + break; + } + +out: + return ret; } int32_t -cdc_compress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, - dict_t **xdata) +cdc_compress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t **xdata) { - int ret = -1; - int i = 0; + int ret = -1; + int i = 0; - ci->iobref = iobref_new (); - if (!ci->iobref) - goto out; + ci->iobref = iobref_new(); + if (!ci->iobref) + goto out; + if (!*xdata) { + *xdata = dict_new(); if (!*xdata) { - *xdata = dict_new (); - if (!*xdata) { - gf_log (this->name, GF_LOG_ERROR, "Cannot allocate xdata" - " dict"); - goto out; - } - } - - /* data */ - for (i = 
0; i < ci->count; i++) { - ret = do_cdc_compress (&ci->vector[i], this, priv, ci); - if (ret != Z_OK) - goto deflate_cleanup_out; - } - - /* flush zlib buffer */ - ret = cdc_flush_libz_buffer (priv, this, ci, deflate, Z_FINISH); - if (!(ret == Z_OK || ret == Z_STREAM_END)) { - gf_log (this->name, GF_LOG_ERROR, - "Compression Error: ret (%d)", ret); - ret = -1; - goto deflate_cleanup_out; - } - - /* trailer */ - ret = cdc_init_gzip_trailer (this, priv, ci); - if (ret) - goto deflate_cleanup_out; - - gf_log (this->name, GF_LOG_DEBUG, - "Compressed %ld to %ld bytes", - ci->stream.total_in, ci->stream.total_out); - - ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE; - - /* set deflated canary value for identification */ - ret = dict_set_int32 (*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1); - if (ret) { - /* Send uncompressed data if we can't _tell_ the client - * that deflated data is on it's way. So, we just log - * the faliure and continue as usual. - */ - gf_log (this->name, GF_LOG_ERROR, - "Data deflated, but could not set canary" - " value in dict for identification"); + gf_log(this->name, GF_LOG_ERROR, + "Cannot allocate xdata" + " dict"); + goto out; } + } + + /* data */ + for (i = 0; i < ci->count; i++) { + ret = do_cdc_compress(&ci->vector[i], this, priv, ci); + if (ret != Z_OK) + goto deflate_cleanup_out; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer(priv, this, ci, deflate, Z_FINISH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log(this->name, GF_LOG_ERROR, "Compression Error: ret (%d)", ret); + ret = -1; + goto deflate_cleanup_out; + } + + /* trailer */ + ret = cdc_init_gzip_trailer(this, priv, ci); + if (ret) + goto deflate_cleanup_out; + + gf_log(this->name, GF_LOG_DEBUG, "Compressed %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + + ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE; + + /* set deflated canary value for identification */ + ret = dict_set_int32(*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1); + if 
(ret) { + /* Send uncompressed data if we can't _tell_ the client + * that deflated data is on it's way. So, we just log + * the failure and continue as usual. + */ + gf_log(this->name, GF_LOG_ERROR, + "Data deflated, but could not set canary" + " value in dict for identification"); + } - /* This is to be used in testing */ - if ( priv->debug ) { - cdc_dump_iovec_to_disk (this, ci, GF_CDC_DEBUG_DUMP_FILE ); - } + /* This is to be used in testing */ + if (priv->debug) { + cdc_dump_iovec_to_disk(this, ci, GF_CDC_DEBUG_DUMP_FILE); + } - deflate_cleanup_out: - (void) deflateEnd(&ci->stream); +deflate_cleanup_out: + (void)deflateEnd(&ci->stream); - out: - return ret; +out: + return ret; } - /* deflate content is checked by the presence of a canary * value in the dict as the key */ static int32_t -cdc_check_content_for_deflate (dict_t *xdata) +cdc_check_content_for_deflate(dict_t *xdata) { - return dict_get (xdata, GF_CDC_DEFLATE_CANARY_VAL) ? -1 : 0; + return dict_get(xdata, GF_CDC_DEFLATE_CANARY_VAL) ? 
-1 : 0; } static unsigned long -cdc_extract_crc (char *trailer) +cdc_extract_crc(char *trailer) { - return cdc_get_long ((unsigned char *) &trailer[0]); + return cdc_get_long((unsigned char *)&trailer[0]); } static unsigned long -cdc_extract_size (char *trailer) +cdc_extract_size(char *trailer) { - return cdc_get_long ((unsigned char *) &trailer[4]); + return cdc_get_long((unsigned char *)&trailer[4]); } static int32_t -cdc_validate_inflate (cdc_info_t *ci, unsigned long crc, - unsigned long len) +cdc_validate_inflate(cdc_info_t *ci, unsigned long crc, unsigned long len) { - return !((crc == ci->crc) - /* inflated length is hidden inside - * Zlib stream struct */ - && (len == ci->stream.total_out)); + return !((crc == ci->crc) + /* inflated length is hidden inside + * Zlib stream struct */ + && (len == ci->stream.total_out)); } static int32_t -do_cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) +do_cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) { - int ret = -1; - int i = 0; - int len = 0; - char *inflte = NULL; - char *trailer = NULL; - struct iovec vec = {0,}; - unsigned long computed_crc = 0; - unsigned long computed_len = 0; - - ret = inflateInit2 (&ci->stream, priv->window_size); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Zlib: Unable to initialize inflate"); - goto out; + int ret = -1; + int i = 0; + int len = 0; + char *inflte = NULL; + char *trailer = NULL; + struct iovec vec = { + 0, + }; + unsigned long computed_crc = 0; + unsigned long computed_len = 0; + + ret = inflateInit2(&ci->stream, priv->window_size); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Zlib: Unable to initialize inflate"); + goto out; + } + + vec = THIS_VEC(ci, 0); + + trailer = (char *)(((char *)vec.iov_base) + vec.iov_len - + GF_CDC_VALIDATION_SIZE); + + /* CRC of uncompressed data */ + computed_crc = cdc_extract_crc(trailer); + + /* size of uncomrpessed data */ + computed_len = cdc_extract_size(trailer); + + gf_log(this->name, 
GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d", + computed_crc, computed_len, ci->buffer_size); + + inflte = vec.iov_base; + len = vec.iov_len - GF_CDC_VALIDATION_SIZE; + + /* allocate buffer of the original length of the data */ + ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0); + if (ret) + goto out; + + /* setup output buffer */ + cdc_init_zlib_output_stream(priv, ci, 0); + + /* setup input buffer */ + ci->stream.next_in = (unsigned char *)inflte; + ci->stream.avail_in = len; + + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + CURR_VEC(ci).iov_len = ci->buffer_size; + + ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0); + if (ret) + break; + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream(priv, ci, 0); } - vec = THIS_VEC(ci, 0); - - trailer = (char *) (((char *) vec.iov_base) + vec.iov_len - - GF_CDC_VALIDATION_SIZE); - - /* CRC of uncompressed data */ - computed_crc = cdc_extract_crc (trailer); - - /* size of uncomrpessed data */ - computed_len = cdc_extract_size (trailer); - - gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d", - computed_crc, computed_len, ci->buffer_size); - - inflte = vec.iov_base ; - len = vec.iov_len - GF_CDC_VALIDATION_SIZE; - - /* allocate buffer of the original length of the data */ - ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); - if (ret) - goto out; - - /* setup output buffer */ - cdc_init_zlib_output_stream (priv, ci, 0); - - /* setup input buffer */ - ci->stream.next_in = (unsigned char *) inflte; - ci->stream.avail_in = len; - - while (ci->stream.avail_in != 0) { - if (ci->stream.avail_out == 0) { - CURR_VEC(ci).iov_len = ci->buffer_size; - - ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); - if (ret) - break; - - /* Re-position Zlib output buffer */ - cdc_init_zlib_output_stream (priv, ci, 0); - } - - ret = inflate (&ci->stream, Z_NO_FLUSH); - if (ret == Z_STREAM_ERROR) - break; - } - - /* flush zlib buffer */ - ret = cdc_flush_libz_buffer 
(priv, this, ci, inflate, Z_SYNC_FLUSH); - if (!(ret == Z_OK || ret == Z_STREAM_END)) { - gf_log (this->name, GF_LOG_ERROR, - "Decompression Error: ret (%d)", ret); - ret = -1; - goto out; - } - - /* compute CRC of the uncompresses data to check for - * correctness */ - - for (i = 0; i < ci->ncount; i++) { - ci->crc = crc32 (ci->crc, - (const Bytef *) ci->vec[i].iov_base, - ci->vec[i].iov_len); - } - - /* validate inflated data */ - ret = cdc_validate_inflate (ci, computed_crc, computed_len); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Checksum or length mismatched in inflated data"); - } - - out: - return ret; + ret = inflate(&ci->stream, Z_NO_FLUSH); + if (ret == Z_STREAM_ERROR) + break; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer(priv, this, ci, inflate, Z_SYNC_FLUSH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log(this->name, GF_LOG_ERROR, "Decompression Error: ret (%d)", ret); + ret = -1; + goto out; + } + + /* compute CRC of the uncompresses data to check for + * correctness */ + + for (i = 0; i < ci->ncount; i++) { + ci->crc = crc32(ci->crc, (const Bytef *)ci->vec[i].iov_base, + ci->vec[i].iov_len); + } + + /* validate inflated data */ + ret = cdc_validate_inflate(ci, computed_crc, computed_len); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Checksum or length mismatched in inflated data"); + } + +out: + return ret; } int32_t -cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, - dict_t *xdata) +cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t *xdata) { - int32_t ret = -1; - - /* check for deflate content */ - if (!cdc_check_content_for_deflate (xdata)) { - gf_log (this->name, GF_LOG_DEBUG, - "Content not deflated, passing through ..."); - goto passthrough_out; - } - - ci->iobref = iobref_new (); - if (!ci->iobref) - goto passthrough_out; - - /* do we need to do this? can we assume that one iovec - * will hold per request data everytime? 
- * - * server/client protocol seems to deal with a single - * iovec even if op_ret > 1M. So, it looks ok to - * assume that a single iovec will contain all the - * data (This saves us a lot from finding the trailer - * and the data since it could have been split-up onto - * two adjacent iovec's. - * - * But, in case this translator is loaded above quick-read - * for some reason, then it's entirely possible that we get - * multiple iovec's... - * - * This case (handled below) is not tested. (by loading the - * xlator below quick-read) - */ - - /* @@ I_HOPE_THIS_IS_NEVER_HIT */ - if (ci->count > 1) { - gf_log (this->name, GF_LOG_WARNING, "unable to handle" - " multiple iovecs (%d in number)", ci->count); - goto inflate_cleanup_out; - /* TODO: coallate all iovecs in one */ - } - - ret = do_cdc_decompress (this, priv, ci); - if (ret) - goto inflate_cleanup_out; - - ci->nbytes = ci->stream.total_out; - - gf_log (this->name, GF_LOG_DEBUG, - "Inflated %ld to %ld bytes", - ci->stream.total_in, ci->stream.total_out); - - inflate_cleanup_out: - (void) inflateEnd (&ci->stream); - - passthrough_out: - return ret; + int32_t ret = -1; + + /* check for deflate content */ + if (!cdc_check_content_for_deflate(xdata)) { + gf_log(this->name, GF_LOG_DEBUG, + "Content not deflated, passing through ..."); + goto passthrough_out; + } + + ci->iobref = iobref_new(); + if (!ci->iobref) + goto passthrough_out; + + /* do we need to do this? can we assume that one iovec + * will hold per request data every time? + * + * server/client protocol seems to deal with a single + * iovec even if op_ret > 1M. So, it looks ok to + * assume that a single iovec will contain all the + * data (This saves us a lot from finding the trailer + * and the data since it could have been split-up onto + * two adjacent iovec's. + * + * But, in case this translator is loaded above quick-read + * for some reason, then it's entirely possible that we get + * multiple iovec's... 
+ * + * This case (handled below) is not tested. (by loading the + * xlator below quick-read) + */ + + /* @@ I_HOPE_THIS_IS_NEVER_HIT */ + if (ci->count > 1) { + gf_log(this->name, GF_LOG_WARNING, + "unable to handle" + " multiple iovecs (%d in number)", + ci->count); + goto inflate_cleanup_out; + /* TODO: coallate all iovecs in one */ + } + + ret = do_cdc_decompress(this, priv, ci); + if (ret) + goto inflate_cleanup_out; + + ci->nbytes = ci->stream.total_out; + + gf_log(this->name, GF_LOG_DEBUG, "Inflated %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + +inflate_cleanup_out: + (void)inflateEnd(&ci->stream); + +passthrough_out: + return ret; } #endif diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h index efa00805987..928afdd2efe 100644 --- a/xlators/features/compress/src/cdc-mem-types.h +++ b/xlators/features/compress/src/cdc-mem-types.h @@ -11,12 +11,13 @@ #ifndef __CDC_MEM_TYPES_H #define __CDC_MEM_TYPES_H -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_cdc_mem_types { - gf_cdc_mt_priv_t = gf_common_mt_end + 1, - gf_cdc_mt_vec_t = gf_common_mt_end + 2, - gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3, + gf_cdc_mt_priv_t = gf_common_mt_end + 1, + gf_cdc_mt_vec_t = gf_common_mt_end + 2, + gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3, + gf_cdc_mt_end = gf_common_mt_end + 4, }; #endif diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c index eb7d87c5698..b0b51e914ed 100644 --- a/xlators/features/compress/src/cdc.c +++ b/xlators/features/compress/src/cdc.c @@ -10,333 +10,339 @@ #include <sys/uio.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" -#include "logging.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/logging.h> #include "cdc.h" #include "cdc-mem-types.h" static void -cdc_cleanup_iobref (cdc_info_t *ci) 
+cdc_cleanup_iobref(cdc_info_t *ci) { - assert(ci->iobref != NULL); - iobref_clear (ci->iobref); + assert(ci->iobref != NULL); + iobref_clear(ci->iobref); } int32_t -cdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) +cdc_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - int ret = -1; - cdc_priv_t *priv = NULL; - cdc_info_t ci = {0,}; - - GF_VALIDATE_OR_GOTO ("cdc", this, default_out); - GF_VALIDATE_OR_GOTO (this->name, frame, default_out); - - priv = this->private; - - if (op_ret <= 0) - goto default_out; - - if ( (priv->min_file_size != 0) - && (op_ret < priv->min_file_size) ) - goto default_out; - - ci.count = count; - ci.ibytes = op_ret; - ci.vector = vector; - ci.buf = NULL; - ci.iobref = NULL; - ci.ncount = 0; - ci.crc = 0; - ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; - -/* A readv compresses on the server side and decompresses on the client side - */ - if (priv->op_mode == GF_CDC_MODE_SERVER) { - ret = cdc_compress (this, priv, &ci, &xdata); - } else if (priv->op_mode == GF_CDC_MODE_CLIENT) { - ret = cdc_decompress (this, priv, &ci, xdata); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Invalid operation mode (%d)", priv->op_mode); - } - - if (ret) - goto default_out; - - STACK_UNWIND_STRICT (readv, frame, ci.nbytes, op_errno, - ci.vec, ci.ncount, stbuf, iobref, - xdata); - cdc_cleanup_iobref (&ci); - return 0; - - default_out: - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - vector, count, stbuf, iobref, xdata); - return 0; + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = { + 0, + }; + + GF_VALIDATE_OR_GOTO("cdc", this, default_out); + GF_VALIDATE_OR_GOTO(this->name, frame, default_out); + + priv = this->private; + + if (op_ret <= 0) + goto 
default_out; + + if ((priv->min_file_size != 0) && (op_ret < priv->min_file_size)) + goto default_out; + + ci.count = count; + ci.ibytes = op_ret; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + + /* A readv compresses on the server side and decompresses on the client side + */ + if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_compress(this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_decompress(this, priv, &ci, xdata); + } else { + gf_log(this->name, GF_LOG_ERROR, "Invalid operation mode (%d)", + priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_UNWIND_STRICT(readv, frame, ci.nbytes, op_errno, ci.vec, ci.ncount, + stbuf, iobref, xdata); + cdc_cleanup_iobref(&ci); + return 0; + +default_out: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + return 0; } int32_t -cdc_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, - dict_t *xdata) +cdc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - fop_readv_cbk_t cbk = NULL; + fop_readv_cbk_t cbk = NULL; #ifdef HAVE_LIB_Z - cbk = cdc_readv_cbk; + cbk = cdc_readv_cbk; #else - cbk = default_readv_cbk; + cbk = default_readv_cbk; #endif - STACK_WIND (frame, cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, size, offset, flags, xdata); - return 0; + STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; } int32_t -cdc_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +cdc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { + 
STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; +int32_t +cdc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = { + 0, + }; + size_t isize = 0; + + GF_VALIDATE_OR_GOTO("cdc", this, err); + GF_VALIDATE_OR_GOTO(this->name, frame, err); + + priv = this->private; + + isize = iov_length(vector, count); + + if (isize <= 0) + goto default_out; + + if ((priv->min_file_size != 0) && (isize < priv->min_file_size)) + goto default_out; + + ci.count = count; + ci.ibytes = isize; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + + /* A writev compresses on the client side and decompresses on the server + * side + */ + if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_compress(this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_decompress(this, priv, &ci, xdata); + } else { + gf_log(this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", + priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_WIND(frame, cdc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, ci.vec, ci.ncount, offset, + flags, iobref, xdata); + + cdc_cleanup_iobref(&ci); + return 0; + +default_out: + STACK_WIND(frame, cdc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +err: + STACK_UNWIND_STRICT(writev, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; } int32_t -cdc_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset, - uint32_t flags, - struct iobref *iobref, dict_t *xdata) +mem_acct_init(xlator_t 
*this) { - int ret = -1; - cdc_priv_t *priv = NULL; - cdc_info_t ci = {0,}; - size_t isize = 0; - - GF_VALIDATE_OR_GOTO ("cdc", this, default_out); - GF_VALIDATE_OR_GOTO (this->name, frame, default_out); - - priv = this->private; - - isize = iov_length(vector, count); - - if (isize <= 0) - goto default_out; - - if ( (priv->min_file_size != 0) - && (isize < priv->min_file_size) ) - goto default_out; - - ci.count = count; - ci.ibytes = isize; - ci.vector = vector; - ci.buf = NULL; - ci.iobref = NULL; - ci.ncount = 0; - ci.crc = 0; - ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; - -/* A writev compresses on the client side and decompresses on the server side - */ - if (priv->op_mode == GF_CDC_MODE_CLIENT) { - ret = cdc_compress (this, priv, &ci, &xdata); - } else if (priv->op_mode == GF_CDC_MODE_SERVER) { - ret = cdc_decompress (this, priv, &ci, xdata); - } else { - gf_log (this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", priv->op_mode); - } - - if (ret) - goto default_out; - - STACK_WIND (frame, - cdc_writev_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->writev, - fd, ci.vec, ci.ncount, offset, flags, - iobref, xdata); - - cdc_cleanup_iobref (&ci); - return 0; - - default_out: - STACK_WIND (frame, - cdc_writev_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, - iobref, xdata); - return 0; + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_cdc_mt_end); + + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting init" + "failed"); + return ret; + } + + return ret; } int32_t -init (xlator_t *this) +init(xlator_t *this) { - int ret = -1; - char *temp_str = NULL; - cdc_priv_t *priv = NULL; - - GF_VALIDATE_OR_GOTO ("cdc", this, err); - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "Need subvolume == 1"); - goto err; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Dangling volume. 
Check volfile"); - } - - priv = GF_CALLOC (1, sizeof (*priv), gf_cdc_mt_priv_t); - if (!priv) { - goto err; - } - - /* Check if debug mode is turned on */ - GF_OPTION_INIT ("debug", priv->debug, bool, err); - if( priv->debug ) { - gf_log (this->name, GF_LOG_DEBUG, "CDC debug option turned on"); - } - - /* Set Gzip Window Size */ - GF_OPTION_INIT ("window-size", priv->window_size, int32, err); - if ( (priv->window_size > GF_CDC_MAX_WINDOWSIZE) - || (priv->window_size < GF_CDC_DEF_WINDOWSIZE) ) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid gzip window size (%d), using default", - priv->window_size); - priv->window_size = GF_CDC_DEF_WINDOWSIZE; - } - - /* Set Gzip (De)Compression Level */ - GF_OPTION_INIT ("compression-level", priv->cdc_level, int32, err); - if ( ((priv->cdc_level < 1) || (priv->cdc_level > 9)) - && (priv->cdc_level != GF_CDC_DEF_COMPRESSION) ) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid gzip (de)compression level (%d)," - " using default", priv->cdc_level); - priv->cdc_level = GF_CDC_DEF_COMPRESSION; - } - - /* Set Gzip Memory Level */ - GF_OPTION_INIT ("mem-level", priv->mem_level, int32, err); - if ( (priv->mem_level < 1) || (priv->mem_level > 9) ) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid gzip memory level, using the default"); - priv->mem_level = GF_CDC_DEF_MEMLEVEL; - } - - /* Set min file size to enable compression */ - GF_OPTION_INIT ("min-size", priv->min_file_size, int32, err); - - /* Mode of operation - Server/Client */ - ret = dict_get_str (this->options, "mode", &temp_str); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "Operation mode not specified !!"); - goto err; - } - - if (GF_CDC_MODE_IS_CLIENT (temp_str)) { - priv->op_mode = GF_CDC_MODE_CLIENT; - } else if (GF_CDC_MODE_IS_SERVER (temp_str)) { - priv->op_mode = GF_CDC_MODE_SERVER; - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Bogus operation mode (%s) specified", temp_str); - goto err; - } - - this->private = priv; - gf_log (this->name, 
GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",temp_str); - return 0; - - err: - if (priv) - GF_FREE (priv); - - return -1; + int ret = -1; + char *temp_str = NULL; + cdc_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("cdc", this, err); + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, "Need subvolume == 1"); + goto err; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "Dangling volume. Check volfile"); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_cdc_mt_priv_t); + if (!priv) { + goto err; + } + + /* Check if debug mode is turned on */ + GF_OPTION_INIT("debug", priv->debug, bool, err); + if (priv->debug) { + gf_log(this->name, GF_LOG_DEBUG, "CDC debug option turned on"); + } + + /* Set Gzip Window Size */ + GF_OPTION_INIT("window-size", priv->window_size, int32, err); + if ((priv->window_size > GF_CDC_MAX_WINDOWSIZE) || + (priv->window_size < GF_CDC_DEF_WINDOWSIZE)) { + gf_log(this->name, GF_LOG_WARNING, + "Invalid gzip window size (%d), using default", + priv->window_size); + priv->window_size = GF_CDC_DEF_WINDOWSIZE; + } + + /* Set Gzip (De)Compression Level */ + GF_OPTION_INIT("compression-level", priv->cdc_level, int32, err); + if (((priv->cdc_level < 1) || (priv->cdc_level > 9)) && + (priv->cdc_level != GF_CDC_DEF_COMPRESSION)) { + gf_log(this->name, GF_LOG_WARNING, + "Invalid gzip (de)compression level (%d)," + " using default", + priv->cdc_level); + priv->cdc_level = GF_CDC_DEF_COMPRESSION; + } + + /* Set Gzip Memory Level */ + GF_OPTION_INIT("mem-level", priv->mem_level, int32, err); + if ((priv->mem_level < 1) || (priv->mem_level > 9)) { + gf_log(this->name, GF_LOG_WARNING, + "Invalid gzip memory level, using the default"); + priv->mem_level = GF_CDC_DEF_MEMLEVEL; + } + + /* Set min file size to enable compression */ + GF_OPTION_INIT("min-size", priv->min_file_size, int32, err); + + /* Mode of operation - Server/Client */ + ret = dict_get_str(this->options, "mode", &temp_str); + if (ret) { + 
gf_log(this->name, GF_LOG_CRITICAL, "Operation mode not specified !!"); + goto err; + } + + if (GF_CDC_MODE_IS_CLIENT(temp_str)) { + priv->op_mode = GF_CDC_MODE_CLIENT; + } else if (GF_CDC_MODE_IS_SERVER(temp_str)) { + priv->op_mode = GF_CDC_MODE_SERVER; + } else { + gf_log(this->name, GF_LOG_CRITICAL, + "Bogus operation mode (%s) specified", temp_str); + goto err; + } + + this->private = priv; + gf_log(this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode", + temp_str); + return 0; + +err: + if (priv) + GF_FREE(priv); + + return -1; } void -fini (xlator_t *this) +fini(xlator_t *this) { - cdc_priv_t *priv = this->private; + cdc_priv_t *priv = this->private; - if (priv) - GF_FREE (priv); - this->private = NULL; - return; + if (priv) + GF_FREE(priv); + this->private = NULL; + return; } struct xlator_fops fops = { - .readv = cdc_readv, - .writev = cdc_writev, + .readv = cdc_readv, + .writev = cdc_writev, }; -struct xlator_cbks cbks = { -}; +struct xlator_cbks cbks = {}; struct volume_options options[] = { - { .key = {"window-size"}, - .default_value = "-15", - .type = GF_OPTION_TYPE_INT, - .description = "Size of the zlib history buffer." - }, - { .key = {"mem-level"}, - .default_value = "8", - .type = GF_OPTION_TYPE_INT, - .description = "Memory allocated for internal compression state.\ - 1 uses minimum memory but is slow and reduces \ - compression ratio; memLevel=9 uses maximum memory \ - for optimal speed. The default value is 8." - }, - { .key = {"compression-level"}, - .default_value = "-1", - .type = GF_OPTION_TYPE_INT, - .description = "Compression levels \ - 0 : no compression, 1 : best speed, \ - 9 : best compression, -1 : default compression " - }, - { .key = {"min-size"}, - .default_value = "0", - .type = GF_OPTION_TYPE_INT, - .description = "Data is compressed only when its size exceeds this." - }, - { .key = {"mode"}, - .value = {"server", "client"}, - .type = GF_OPTION_TYPE_STR, - .description = "Set on the basis of where the xlator is loaded." 
- }, - { .key = {"debug"}, - .default_value = "false", - .type = GF_OPTION_TYPE_BOOL, - .description = "This is used in testing. Will dump compressed data \ - to disk as a gzip file." - }, - { .key = {NULL} - }, + {.key = {"window-size"}, + .default_value = "-15", + .type = GF_OPTION_TYPE_INT, + .description = "Size of the zlib history buffer."}, + {.key = {"mem-level"}, + .default_value = "8", + .type = GF_OPTION_TYPE_INT, + .description = "Memory allocated for internal compression state. " + "1 uses minimum memory but is slow and reduces " + "compression ratio; memLevel=9 uses maximum memory " + "for optimal speed. The default value is 8."}, + {.key = {"compression-level"}, + .default_value = "-1", + .type = GF_OPTION_TYPE_INT, + .description = "Compression levels \n" + "0 : no compression, 1 : best speed, \n" + "9 : best compression, -1 : default compression "}, + {.key = {"min-size"}, + .default_value = "0", + .type = GF_OPTION_TYPE_INT, + .description = "Data is compressed only when its size exceeds this."}, + {.key = {"mode"}, + .value = {"server", "client"}, + .type = GF_OPTION_TYPE_STR, + .description = "Set on the basis of where the xlator is loaded. " + "This option should NOT be configured by user."}, + {.key = {"debug"}, + .default_value = "false", + .type = GF_OPTION_TYPE_BOOL, + .description = "This is used in testing. 
Will dump compressed data " + "to disk as a gzip file."}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .mem_acct_init = mem_acct_init, + .op_version = {GD_OP_VERSION_3_9_0}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "cdc", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h index 71f4d2317bb..cb87b06a989 100644 --- a/xlators/features/compress/src/cdc.h +++ b/xlators/features/compress/src/cdc.h @@ -15,41 +15,41 @@ #include "zlib.h" #endif -#include "xlator.h" +#include <glusterfs/xlator.h> #ifndef MAX_IOVEC #define MAX_IOVEC 16 #endif typedef struct cdc_priv { - int window_size; - int mem_level; - int cdc_level; - int min_file_size; - int op_mode; - gf_boolean_t debug; - gf_lock_t lock; + int window_size; + int mem_level; + int cdc_level; + int min_file_size; + int op_mode; + gf_boolean_t debug; + gf_lock_t lock; } cdc_priv_t; typedef struct cdc_info { - /* input bits */ - int count; - int32_t ibytes; - struct iovec *vector; - struct iatt *buf; - - /* output bits */ - int ncount; - int nbytes; - int buffer_size; - struct iovec vec[MAX_IOVEC]; - struct iobref *iobref; - - /* zlib bits */ + /* input bits */ + int count; + int32_t ibytes; + struct iovec *vector; + struct iatt *buf; + + /* output bits */ + int ncount; + int nbytes; + int buffer_size; + struct iovec vec[MAX_IOVEC]; + struct iobref *iobref; + + /* zlib bits */ #ifdef HAVE_LIB_Z - z_stream stream; + z_stream stream; #endif - unsigned long crc; + unsigned long crc; } cdc_info_t; #define NVEC(ci) (ci->ncount - 1) @@ -57,8 +57,8 @@ typedef struct cdc_info { #define THIS_VEC(ci, i) ci->vector[i] /* Gzip defaults */ -#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */ -#define GF_CDC_MAX_WINDOWSIZE -8 /* max value */ +#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */ +#define GF_CDC_MAX_WINDOWSIZE -8 /* max value */ #ifdef HAVE_LIB_Z #define GF_CDC_DEF_COMPRESSION 
Z_DEFAULT_COMPRESSION @@ -66,15 +66,15 @@ typedef struct cdc_info { #define GF_CDC_DEF_COMPRESSION -1 #endif -#define GF_CDC_DEF_MEMLEVEL 8 -#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size +#define GF_CDC_DEF_MEMLEVEL 8 +#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size /* Operation mode * If xlator is loaded on client, readv decompresses and writev compresses * If xlator is loaded on server, readv compresses and writev decompresses */ -#define GF_CDC_MODE_CLIENT 0 -#define GF_CDC_MODE_SERVER 1 +#define GF_CDC_MODE_CLIENT 0 +#define GF_CDC_MODE_SERVER 1 /* min size of data to do cmpression * 0 == compress even 1byte @@ -87,21 +87,13 @@ typedef struct cdc_info { #define GF_CDC_DEFLATE_CANARY_VAL "deflate" #define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz" -#define GF_CDC_MODE_IS_CLIENT(m) \ - (strcmp (m, "client") == 0) +#define GF_CDC_MODE_IS_CLIENT(m) (strcmp(m, "client") == 0) -#define GF_CDC_MODE_IS_SERVER(m) \ - (strcmp (m, "server") == 0) +#define GF_CDC_MODE_IS_SERVER(m) (strcmp(m, "server") == 0) int32_t -cdc_compress (xlator_t *this, - cdc_priv_t *priv, - cdc_info_t *ci, - dict_t **xdata); +cdc_compress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t **xdata); int32_t -cdc_decompress (xlator_t *this, - cdc_priv_t *priv, - cdc_info_t *ci, - dict_t *xdata); +cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t *xdata); #endif diff --git a/xlators/features/filter/src/Makefile.am b/xlators/features/filter/src/Makefile.am deleted file mode 100644 index d1fda8b0a9d..00000000000 --- a/xlators/features/filter/src/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ -xlator_LTLIBRARIES = filter.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features - -filter_la_LDFLAGS = -module -avoid-version - -filter_la_SOURCES = filter.c -filter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = filter-mem-types.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) 
-I$(top_srcdir)/libglusterfs/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/features/filter/src/filter.c b/xlators/features/filter/src/filter.c deleted file mode 100644 index 1d4887b7143..00000000000 --- a/xlators/features/filter/src/filter.c +++ /dev/null @@ -1,1734 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "filter-mem-types.h" - -#define GF_FILTER_NOBODY_UID 65534 -#define GF_FILTER_NOBODY_GID 65534 -#define GF_FILTER_ROOT_UID 0 -#define GF_FILTER_ROOT_GID 0 - -#define GF_MAXIMUM_FILTERING_ALLOWED 32 - -/* - option root-filtering on (off by default) - option translate-uid <uid-range=newuid,uid=newuid> - option translate-gid <gid-range=newgid,gid=newgid> - option read-only <yes|true> - option fixed-uid <uid> - option fixed-gid <gid> - option filter-uid <uid-range,uid> - option filter-gid <gid-range,gid> // not supported yet - -*/ - -struct gf_filter { - /* Flags */ - gf_boolean_t complete_read_only; - char fixed_uid_set; - char fixed_gid_set; - char partial_filter; - - /* Options */ - /* Mapping/Filtering/Translate whatever you want to call */ - int translate_num_uid_entries; - int translate_num_gid_entries; - int translate_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2]; - int translate_output_uid[GF_MAXIMUM_FILTERING_ALLOWED]; - int translate_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2]; - int translate_output_gid[GF_MAXIMUM_FILTERING_ALLOWED]; - - /* Fixed uid/gid */ - int 
fixed_uid; - int fixed_gid; - - /* Filter */ - int filter_num_uid_entries; - int filter_num_gid_entries; - int filter_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2]; - int filter_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2]; - -}; - -/* update_frame: The main logic of the whole translator. - Return values: - 0: no change - // TRANSLATE - 1: only uid changed - 2: only gid changed - 3: both uid/gid changed - // FILTER - 4: uid in filter range - 5: gid in filter range // not supported yet - 6: complete fs is readonly -*/ - -#define GF_FILTER_NO_CHANGE 0 -#define GF_FILTER_MAP_UID 1 -#define GF_FILTER_MAP_GID 2 -#define GF_FILTER_MAP_BOTH 3 -#define GF_FILTER_FILTER_UID 4 -#define GF_FILTER_FILTER_GID 5 -#define GF_FILTER_RO_FS 6 - -static int32_t -update_frame (call_frame_t *frame, - inode_t *inode, - struct gf_filter *filter) -{ - uid_t uid = 0; - int32_t idx = 0; - int32_t ret = 0; - int32_t dictret = 0; - uint64_t tmp_uid = 0; - - for (idx = 0; idx < filter->translate_num_uid_entries; idx++) { - if ((frame->root->uid >=filter->translate_input_uid[idx][0]) && - (frame->root->uid <=filter->translate_input_uid[idx][1])) { - dictret = inode_ctx_get (inode, frame->this, &tmp_uid); - uid = (uid_t)tmp_uid; - if (dictret == 0) { - if (frame->root->uid != uid) - ret = GF_FILTER_MAP_UID; - } else { - ret = GF_FILTER_MAP_UID; - } - break; - } - } - - for (idx = 0; idx < filter->translate_num_gid_entries; idx++) { - if ((frame->root->gid >=filter->translate_input_gid[idx][0]) && - (frame->root->gid <=filter->translate_input_gid[idx][1])) { - if (ret == GF_FILTER_NO_CHANGE) - ret = GF_FILTER_MAP_GID; - else - ret = GF_FILTER_MAP_BOTH; - break; - } - } - - - if (filter->complete_read_only) - return GF_FILTER_RO_FS; - - if (filter->partial_filter) { - dictret = inode_ctx_get (inode, frame->this, &tmp_uid); - uid = (uid_t)tmp_uid; - if (dictret != -1) { - for (idx = 0; idx < filter->filter_num_uid_entries; - idx++) { - if ((uid >=filter->filter_input_uid[idx][0]) && - (uid 
<=filter->filter_input_uid[idx][1])) { - return GF_FILTER_FILTER_UID; - } - } - } - } - - return ret; -} - -/* if 'root' don't change the uid/gid */ -static int32_t -update_stat (struct iatt *stbuf, - struct gf_filter *filter) -{ - int32_t idx = 0; - for (idx = 0; idx < filter->translate_num_uid_entries; idx++) { - if (stbuf->ia_uid == GF_FILTER_ROOT_UID) - continue; - if ((stbuf->ia_uid >= filter->translate_input_uid[idx][0]) && - (stbuf->ia_uid <= filter->translate_input_uid[idx][1])) { - stbuf->ia_uid = filter->translate_output_uid[idx]; - break; - } - } - - for (idx = 0; idx < filter->translate_num_gid_entries; idx++) { - if (stbuf->ia_gid == GF_FILTER_ROOT_GID) - continue; - if ((stbuf->ia_gid >= filter->translate_input_gid[idx][0]) && - (stbuf->ia_gid <= filter->translate_input_gid[idx][1])) { - stbuf->ia_gid = filter->translate_output_gid[idx]; - break; - } - } - - if (filter->fixed_uid_set) { - stbuf->ia_uid = filter->fixed_uid; - } - - if (filter->fixed_gid_set) { - stbuf->ia_gid = filter->fixed_gid; - } - - return 0; -} - -static int32_t -filter_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int ret = 0; - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict, postparent); - return 0; -} - -int32_t -filter_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - STACK_WIND (frame, - filter_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, - xattr_req); - return 0; -} - - -static int32_t -filter_stat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct 
iatt *buf) -{ - if (op_ret >= 0) { - update_stat (buf, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -filter_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - STACK_WIND (frame, - filter_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc); - return 0; -} - -static int32_t -filter_setattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preop, - struct iatt *postop) -{ - if (op_ret >= 0) { - update_stat (preop, this->private); - update_stat (postop, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, preop, postop); - return 0; -} - -int32_t -filter_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL); - return 0; - default: - break; - } - - STACK_WIND (frame, - filter_setattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, - loc, - stbuf, valid); - return 0; -} - -static int32_t -filter_fsetattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preop, - struct iatt *postop) -{ - if (op_ret >= 0) { - update_stat (preop, this->private); - update_stat (postop, this->private); - } - STACK_UNWIND (frame, - op_ret, - op_errno, - preop, postop); - return 0; -} - -int32_t -filter_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iatt *stbuf, - int32_t valid) -{ - 
STACK_WIND (frame, - filter_fsetattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetattr, - fd, - stbuf, valid); - return 0; -} - - -static int32_t -filter_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - if (op_ret >= 0) { - update_stat (prebuf, this->private); - update_stat (postbuf, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -int32_t -filter_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - filter_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset); - return 0; -} - -static int32_t -filter_ftruncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - if (op_ret >= 0) { - update_stat (prebuf, this->private); - update_stat (postbuf, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -int32_t -filter_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - STACK_WIND (frame, - filter_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset); - return 0; -} - - -static int32_t -filter_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, 
- const char *path, - struct iatt *sbuf) -{ - if (op_ret >= 0) - update_stat (sbuf, this->private); - - STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); - return 0; -} - -int32_t -filter_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IRGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IROTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL); - return 0; - } - STACK_WIND (frame, - filter_readlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, - loc, - size); - return 0; -} - - -static int32_t -filter_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; -} - -int32_t -filter_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - int ret = 0; - inode_t *parent = loc->parent; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL, - NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case 
GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev); - return 0; -} - -static int32_t -filter_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; -} - -int32_t -filter_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - int ret = 0; - inode_t *parent = loc->parent; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL, - NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_mkdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - loc, mode); - return 0; -} - -static int32_t -filter_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - if (op_ret >= 0) { - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - - STACK_UNWIND 
(frame, op_ret, op_errno, preparent, postparent); - return 0; -} - -int32_t -filter_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t ret = 0; - inode_t *parent = loc->parent; - if (!parent) - parent = inode_parent (loc->inode, 0, NULL); - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL); - return 0; - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc); - return 0; -} - -static int32_t -filter_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - if (op_ret >= 0) { - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - - STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); - return 0; -} - -int32_t -filter_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t ret = 0; - inode_t *parent = loc->parent; - if (!parent) - parent = inode_parent (loc->inode, 0, NULL); - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL); - return 0; - case 
GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_rmdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, - loc); - return 0; -} - -static int32_t -filter_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; -} - -int32_t -filter_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - int ret = 0; - inode_t *parent = loc->parent; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL, - NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - linkpath, loc); - return 0; -} - - -static int32_t -filter_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - if (op_ret >= 
0) { - update_stat (buf, this->private); - - update_stat (preoldparent, this->private); - update_stat (postoldparent, this->private); - - update_stat (prenewparent, this->private); - update_stat (postnewparent, this->private); - } - - STACK_UNWIND (frame, op_ret, op_errno, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - return 0; -} - -int32_t -filter_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - int32_t ret = 0; - inode_t *parent = oldloc->parent; - if (!parent) - parent = inode_parent (oldloc->inode, 0, NULL); - ret = update_frame (frame, oldloc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - if (oldloc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - if (oldloc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, - "%s -> %s: returning permission denied", oldloc->path, newloc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, - NULL, NULL, - NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, - NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldloc, newloc); - return 0; -} - - -static int32_t -filter_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, 
postparent); - return 0; -} - -int32_t -filter_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - int ret = 0; - ret = update_frame (frame, oldloc->inode, this->private); - switch (ret) { - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, - filter_link_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, - oldloc, newloc); - return 0; -} - - -static int32_t -filter_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - if (op_ret >= 0) { - update_stat (buf, this->private); - ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->ia_uid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "couldn't set context"); - } - update_stat (preparent, this->private); - update_stat (postparent, this->private); - } - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; -} - -int32_t -filter_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, fd_t *fd) -{ - int ret = 0; - inode_t *parent = loc->parent; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (parent->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (parent->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, NULL, NULL, NULL, - NULL, NULL); - return 0; - - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL, NULL, - NULL, NULL); - return 0; - } - STACK_WIND (frame, filter_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd); - 
return 0; -} - -static int32_t -filter_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - return 0; -} - -int32_t -filter_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - if (!(((flags & O_ACCMODE) == O_WRONLY) - || ((flags & O_ACCMODE) == O_RDWR)) - && (loc->inode->st_mode & S_IRGRP)) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IWOTH) - break; - if (!(((flags & O_ACCMODE) == O_WRONLY) - || ((flags & O_ACCMODE) == O_RDWR)) - && (loc->inode->st_mode & S_IROTH)) - break; - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning permission denied (mode: 0%o, flag=0%o)", - loc->path, loc->inode->st_mode, flags); - STACK_UNWIND (frame, -1, EPERM, fd); - return 0; - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - if (!(((flags & O_ACCMODE) == O_WRONLY) - || ((flags & O_ACCMODE) == O_RDWR))) - break; - STACK_UNWIND (frame, -1, EROFS, NULL); - return 0; - - } - STACK_WIND (frame, - filter_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, flags, fd, wbflags); - return 0; -} - -static int32_t -filter_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - if (op_ret >= 0) { - update_stat (stbuf, this->private); - } - STACK_UNWIND (frame, - op_ret, - op_errno, - vector, - count, - stbuf, - iobref); - return 0; -} - -int32_t -filter_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - STACK_WIND (frame, - filter_readv_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, - size, 
- offset); - return 0; -} - - -static int32_t -filter_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - if (op_ret >= 0) { - update_stat (prebuf, this->private); - update_stat (postbuf, this->private); - } - STACK_UNWIND (frame, - op_ret, - op_errno, - prebuf, - postbuf); - return 0; -} - -int32_t -filter_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - int32_t ret = 0; - ret = update_frame (frame, fd->inode, this->private); - switch (ret) { - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - filter_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, - vector, - count, - off, - iobref); - return 0; -} - -static int32_t -filter_fstat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - if (op_ret >= 0) { - update_stat (buf, this->private); - } - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - return 0; -} - -int32_t -filter_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - STACK_WIND (frame, - filter_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd); - return 0; -} - -static int32_t -filter_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, - op_ret, - op_errno, - fd); - return 0; -} - -int32_t -filter_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - if (loc->inode->st_mode & S_IRGRP) - break; - case GF_FILTER_MAP_BOTH: - if 
(loc->inode->st_mode & S_IWOTH) - break; - if (loc->inode->st_mode & S_IROTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, fd); - return 0; - } - STACK_WIND (frame, - filter_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd); - return 0; -} - - -static int32_t -filter_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, - op_ret, - op_errno); - return 0; -} - -int32_t -filter_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM); - return 0; - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS); - return 0; - } - - STACK_WIND (frame, - filter_setxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - loc, - dict, - flags); - return 0; -} - -static int32_t -filter_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, - op_ret, - op_errno, - dict); - return 0; -} - -int32_t -filter_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IRGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IROTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM, 
NULL); - return 0; - } - - STACK_WIND (frame, - filter_getxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - loc, - name); - return 0; -} - -static int32_t -filter_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -filter_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t ret = 0; - ret = update_frame (frame, loc->inode, this->private); - switch (ret) { - case GF_FILTER_MAP_UID: - if (loc->inode->st_mode & S_IWGRP) - break; - case GF_FILTER_MAP_BOTH: - if (loc->inode->st_mode & S_IWOTH) - break; - gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); - STACK_UNWIND (frame, -1, EPERM); - return 0; - case GF_FILTER_FILTER_UID: - case GF_FILTER_FILTER_GID: - case GF_FILTER_RO_FS: - STACK_UNWIND (frame, -1, EROFS); - return 0; - } - - STACK_WIND (frame, - filter_removexattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - loc, - name); - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_filter_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int32_t -init (xlator_t *this) -{ - char *value = NULL; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *input_value_str1 = NULL; - char *input_value_str2 = NULL; - char *output_value_str = NULL; - int32_t input_value = 0; - int32_t output_value = 0; - data_t *option_data = NULL; - struct gf_filter *filter = NULL; - gf_boolean_t tmp_bool = 0; - - if (!this->children || this->children->next) { - gf_log (this->name, - GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log 
(this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - filter = GF_CALLOC (sizeof (*filter), 1, gf_filter_mt_gf_filter); - ERR_ABORT (filter); - - if (dict_get (this->options, "read-only")) { - value = data_to_str (dict_get (this->options, "read-only")); - if (gf_string2boolean (value, &filter->complete_read_only) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "wrong value provided for 'read-only'"); - return -1; - } - } - - if (dict_get (this->options, "root-squashing")) { - value = data_to_str (dict_get (this->options, "root-squashing")); - if (gf_string2boolean (value, &tmp_bool) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "wrong value provided for 'root-squashing'"); - return -1; - } - if (tmp_bool) { - filter->translate_num_uid_entries = 1; - filter->translate_num_gid_entries = 1; - filter->translate_input_uid[0][0] = GF_FILTER_ROOT_UID; /* root */ - filter->translate_input_uid[0][1] = GF_FILTER_ROOT_UID; /* root */ - filter->translate_input_gid[0][0] = GF_FILTER_ROOT_GID; /* root */ - filter->translate_input_gid[0][1] = GF_FILTER_ROOT_GID; /* root */ - filter->translate_output_uid[0] = GF_FILTER_NOBODY_UID; - filter->translate_output_gid[0] = GF_FILTER_NOBODY_GID; - } - } - - if (dict_get (this->options, "translate-uid")) { - option_data = dict_get (this->options, "translate-uid"); - value = strtok_r (option_data->data, ",", &tmp_str); - while (value) { - dup_str = gf_strdup (value); - input_value_str1 = strtok_r (dup_str, "=", &tmp_str1); - if (input_value_str1) { - /* Check for n-m */ - char *temp_string = gf_strdup (input_value_str1); - input_value_str2 = strtok_r (temp_string, "-", &tmp_str2); - if (gf_string2int (input_value_str2, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str2); - return -1; - } - filter->translate_input_uid[filter->translate_num_uid_entries][0] = input_value; - input_value_str2 = strtok_r (NULL, "-", &tmp_str2); - if (input_value_str2) { - if 
(gf_string2int (input_value_str2, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str2); - return -1; - } - } - filter->translate_input_uid[filter->translate_num_uid_entries][1] = input_value; - GF_FREE (temp_string); - output_value_str = strtok_r (NULL, "=", &tmp_str1); - if (output_value_str) { - if (gf_string2int (output_value_str, &output_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - output_value_str); - return -1; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "mapping string not valid"); - return -1; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "mapping string not valid"); - return -1; - } - filter->translate_output_uid[filter->translate_num_uid_entries] = output_value; - gf_log (this->name, - GF_LOG_DEBUG, - "pair %d: input uid '%d' will be changed to uid '%d'", - filter->translate_num_uid_entries, input_value, output_value); - - filter->translate_num_uid_entries++; - if (filter->translate_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED) - break; - value = strtok_r (NULL, ",", &tmp_str); - GF_FREE (dup_str); - } - } - - tmp_str1 = NULL; - tmp_str2 = NULL; - tmp_str = NULL; - - if (dict_get (this->options, "translate-gid")) { - option_data = dict_get (this->options, "translate-gid"); - value = strtok_r (option_data->data, ",", &tmp_str); - while (value) { - dup_str = gf_strdup (value); - input_value_str1 = strtok_r (dup_str, "=", &tmp_str1); - if (input_value_str1) { - /* Check for n-m */ - char *temp_string = gf_strdup (input_value_str1); - input_value_str2 = strtok_r (temp_string, "-", &tmp_str2); - if (gf_string2int (input_value_str2, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str2); - return -1; - } - filter->translate_input_gid[filter->translate_num_gid_entries][0] = input_value; - input_value_str2 = strtok_r (NULL, "-", &tmp_str2); - if (input_value_str2) { - if (gf_string2int 
(input_value_str2, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str2); - return -1; - } - } - filter->translate_input_gid[filter->translate_num_gid_entries][1] = input_value; - GF_FREE (temp_string); - output_value_str = strtok_r (NULL, "=", &tmp_str1); - if (output_value_str) { - if (gf_string2int (output_value_str, &output_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - output_value_str); - return -1; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "translate-gid value not valid"); - return -1; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "translate-gid value not valid"); - return -1; - } - - filter->translate_output_gid[filter->translate_num_gid_entries] = output_value; - - gf_log (this->name, GF_LOG_DEBUG, - "pair %d: input gid '%d' will be changed to gid '%d'", - filter->translate_num_gid_entries, input_value, output_value); - - filter->translate_num_gid_entries++; - if (filter->translate_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED) - break; - value = strtok_r (NULL, ",", &tmp_str); - GF_FREE (dup_str); - } - } - - tmp_str = NULL; - tmp_str1 = NULL; - - if (dict_get (this->options, "filter-uid")) { - option_data = dict_get (this->options, "filter-uid"); - value = strtok_r (option_data->data, ",", &tmp_str); - while (value) { - dup_str = gf_strdup (value); - /* Check for n-m */ - input_value_str1 = strtok_r (dup_str, "-", &tmp_str1); - if (gf_string2int (input_value_str1, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str1); - return -1; - } - filter->filter_input_uid[filter->filter_num_uid_entries][0] = input_value; - input_value_str1 = strtok_r (NULL, "-", &tmp_str1); - if (input_value_str1) { - if (gf_string2int (input_value_str1, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str1); - return -1; - } - } - 
filter->filter_input_uid[filter->filter_num_uid_entries][1] = input_value; - - gf_log (this->name, - GF_LOG_DEBUG, - "filter [%d]: input uid(s) '%s' will be filtered", - filter->filter_num_uid_entries, dup_str); - - filter->filter_num_uid_entries++; - if (filter->filter_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED) - break; - value = strtok_r (NULL, ",", &tmp_str); - GF_FREE (dup_str); - } - filter->partial_filter = 1; - } - - tmp_str = NULL; - tmp_str1 = NULL; - - if (dict_get (this->options, "filter-gid")) { - option_data = dict_get (this->options, "filter-gid"); - value = strtok_r (option_data->data, ",", &tmp_str); - while (value) { - dup_str = gf_strdup (value); - /* Check for n-m */ - input_value_str1 = strtok_r (dup_str, "-", &tmp_str1); - if (gf_string2int (input_value_str1, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str1); - return -1; - } - filter->filter_input_gid[filter->filter_num_gid_entries][0] = input_value; - input_value_str1 = strtok_r (NULL, "-", &tmp_str1); - if (input_value_str1) { - if (gf_string2int (input_value_str1, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - input_value_str1); - return -1; - } - } - filter->filter_input_gid[filter->filter_num_gid_entries][1] = input_value; - - gf_log (this->name, - GF_LOG_DEBUG, - "filter [%d]: input gid(s) '%s' will be filtered", - filter->filter_num_gid_entries, dup_str); - - filter->filter_num_gid_entries++; - if (filter->filter_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED) - break; - value = strtok_r (NULL, ",", &tmp_str); - GF_FREE (dup_str); - } - gf_log (this->name, GF_LOG_ERROR, "this option is not supported currently.. 
exiting"); - return -1; - filter->partial_filter = 1; - } - - if (dict_get (this->options, "fixed-uid")) { - option_data = dict_get (this->options, "fixed-uid"); - if (gf_string2int (option_data->data, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - option_data->data); - return -1; - } - filter->fixed_uid = input_value; - filter->fixed_uid_set = 1; - } - - if (dict_get (this->options, "fixed-gid")) { - option_data = dict_get (this->options, "fixed-gid"); - if (gf_string2int (option_data->data, &input_value) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", - option_data->data); - return -1; - } - filter->fixed_gid = input_value; - filter->fixed_gid_set = 1; - } - - this->private = filter; - return 0; -} - - -void -fini (xlator_t *this) -{ - struct gf_filter *filter = this->private; - - GF_FREE (filter); - - return; -} - - -struct xlator_fops fops = { - .lookup = filter_lookup, - .stat = filter_stat, - .fstat = filter_fstat, - .readlink = filter_readlink, - .mknod = filter_mknod, - .mkdir = filter_mkdir, - .unlink = filter_unlink, - .rmdir = filter_rmdir, - .symlink = filter_symlink, - .rename = filter_rename, - .link = filter_link, - .truncate = filter_truncate, - .ftruncate = filter_ftruncate, - .create = filter_create, - .open = filter_open, - .readv = filter_readv, - .writev = filter_writev, - .setxattr = filter_setxattr, - .getxattr = filter_getxattr, - .removexattr = filter_removexattr, - .opendir = filter_opendir, - .setattr = filter_setattr, - .fsetattr = filter_fsetattr, -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = { "root-squashing" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = { "read-only" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = { "fixed-uid" }, - .type = GF_OPTION_TYPE_INT - }, - { .key = { "fixed-gid" }, - .type = GF_OPTION_TYPE_INT - }, - { .key = { "translate-uid" }, - .type = GF_OPTION_TYPE_ANY - }, - { .key = { 
"translate-gid" }, - .type = GF_OPTION_TYPE_ANY - }, - { .key = { "filter-uid" }, - .type = GF_OPTION_TYPE_ANY - }, - { .key = { "filter-gid" }, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {NULL} }, -}; diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am index db53affaab3..ff95604c4de 100644 --- a/xlators/features/gfid-access/src/Makefile.am +++ b/xlators/features/gfid-access/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = gfid-access.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -gfid_access_la_LDFLAGS = -module -avoid-version +gfid_access_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) gfid_access_la_SOURCES = gfid-access.c gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = gfid-access.h gfid-access-mem-types.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h index 168d67b431f..1c4d0b93de2 100644 --- a/xlators/features/gfid-access/src/gfid-access-mem-types.h +++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h @@ -11,13 +11,12 @@ #ifndef _GFID_ACCESS_MEM_TYPES_H #define _GFID_ACCESS_MEM_TYPES_H -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_changelog_mem_types { - gf_gfid_access_mt_priv_t = gf_common_mt_end + 1, - gf_gfid_access_mt_gfid_t, - gf_gfid_access_mt_end + gf_gfid_access_mt_priv_t = gf_common_mt_end + 1, + gf_gfid_access_mt_gfid_t, + gf_gfid_access_mt_end }; #endif - diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c index da0ba7e5046..3fea5672a21 100644 --- a/xlators/features/gfid-access/src/gfid-access.c +++ 
b/xlators/features/gfid-access/src/gfid-access.c @@ -7,1166 +7,1414 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "gfid-access.h" -#include "inode.h" -#include "byte-order.h" +#include <glusterfs/inode.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +int +ga_valid_inode_loc_copy(loc_t *dst, loc_t *src, xlator_t *this) +{ + int ret = 0; + uint64_t value = 0; + + /* if its an entry operation, on the virtual */ + /* directory inode as parent, we need to handle */ + /* it properly */ + ret = loc_copy(dst, src); + if (ret < 0) + goto out; + + /* + * Change ALL virtual inodes with real-inodes in loc + */ + if (dst->parent) { + ret = inode_ctx_get(dst->parent, this, &value); + if (ret < 0) { + ret = 0; // real-inode + goto out; + } + inode_unref(dst->parent); + dst->parent = inode_ref((inode_t *)(uintptr_t)value); + gf_uuid_copy(dst->pargfid, dst->parent->gfid); + } + + if (dst->inode) { + ret = inode_ctx_get(dst->inode, this, &value); + if (ret < 0) { + ret = 0; // real-inode + goto out; + } + inode_unref(dst->inode); + dst->inode = inode_ref((inode_t *)(uintptr_t)value); + gf_uuid_copy(dst->gfid, dst->inode->gfid); + } +out: + return ret; +} void -ga_newfile_args_free (ga_newfile_args_t *args) +ga_newfile_args_free(ga_newfile_args_t *args) { - if (!args) - goto out; + if (!args) + goto out; - GF_FREE (args->bname); + GF_FREE(args->bname); - if (S_ISLNK (args->st_mode) && args->args.symlink.linkpath) { - GF_FREE (args->args.symlink.linkpath); - args->args.symlink.linkpath = NULL; - } + if (S_ISLNK(args->st_mode) && args->args.symlink.linkpath) { + GF_FREE(args->args.symlink.linkpath); + args->args.symlink.linkpath = NULL; + } - mem_put (args); + mem_put(args); out: - return; + return; } - void -ga_heal_args_free (ga_heal_args_t *args) +ga_heal_args_free(ga_heal_args_t *args) { - if (!args) - 
goto out; + if (!args) + goto out; - GF_FREE (args->bname); + GF_FREE(args->bname); - mem_put (args); + mem_put(args); out: - return; + return; } - ga_newfile_args_t * -ga_newfile_parse_args (xlator_t *this, data_t *data) +ga_newfile_parse_args(xlator_t *this, data_t *data) { - ga_newfile_args_t *args = NULL; - ga_private_t *priv = NULL; - int len = 0; - int blob_len = 0; - int min_len = 0; - void *blob = NULL; - - priv = this->private; - - blob = data->data; - blob_len = data->len; - - min_len = sizeof (args->uid) + sizeof (args->gid) + sizeof (args->gfid) - + sizeof (args->st_mode) + 2 + 2; - if (blob_len < min_len) { - gf_log (this->name, GF_LOG_ERROR, - "Invalid length: Total length is less " - "than minimum length."); - goto err; + ga_newfile_args_t *args = NULL; + ga_private_t *priv = NULL; + int len = 0; + int blob_len = 0; + int min_len = 0; + void *blob = NULL; + + priv = this->private; + + blob = data->data; + blob_len = data->len; + + min_len = sizeof(args->uid) + sizeof(args->gid) + sizeof(args->gfid) + + sizeof(args->st_mode) + 2 + 2; + if (blob_len < min_len) { + gf_log(this->name, GF_LOG_ERROR, + "Invalid length: Total length is less " + "than minimum length."); + goto err; + } + + args = mem_get0(priv->newfile_args_pool); + if (args == NULL) + goto err; + + args->uid = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + args->gid = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + memcpy(args->gfid, blob, sizeof(args->gfid)); + blob += sizeof(args->gfid); + blob_len -= sizeof(args->gfid); + + args->st_mode = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + len = strnlen(blob, blob_len); + if (len == blob_len) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. 
No null byte present.", + args->gfid); + goto err; + } + + args->bname = GF_MALLOC(len + 1, gf_common_mt_char); + if (args->bname == NULL) + goto err; + + memcpy(args->bname, blob, (len + 1)); + blob += (len + 1); + blob_len -= (len + 1); + + if (S_ISDIR(args->st_mode)) { + if (blob_len < sizeof(uint32_t)) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; } - - args = mem_get0 (priv->newfile_args_pool); - if (args == NULL) - goto err; - - args->uid = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - args->gid = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - memcpy (args->gfid, blob, sizeof (args->gfid)); - blob += sizeof (args->gfid); - blob_len -= sizeof (args->gfid); - - args->st_mode = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - len = strnlen (blob, blob_len); - if (len == blob_len) + args->args.mkdir.mode = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + if (blob_len < sizeof(uint32_t)) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mkdir.umask = ntoh32(*(uint32_t *)blob); + blob_len -= sizeof(uint32_t); + if (blob_len < 0) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; + } + } else if (S_ISLNK(args->st_mode)) { + len = strnlen(blob, blob_len); if (len == blob_len) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. No null byte present.", - args->gfid); - goto err; + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. 
Invalid length", + args->gfid); + goto err; } + args->args.symlink.linkpath = GF_MALLOC(len + 1, gf_common_mt_char); + if (args->args.symlink.linkpath == NULL) + goto err; - args->bname = GF_CALLOC (1, (len + 1), gf_common_mt_char); - if (args->bname == NULL) - goto err; - - memcpy (args->bname, blob, (len + 1)); - blob += (len + 1); + memcpy(args->args.symlink.linkpath, blob, (len + 1)); blob_len -= (len + 1); - - if (S_ISDIR (args->st_mode)) { - if (blob_len < sizeof (uint32_t)) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - args->args.mkdir.mode = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - if (blob_len < sizeof (uint32_t)) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - args->args.mkdir.umask = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - if (blob_len < 0) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - } else if (S_ISLNK (args->st_mode)) { - len = strnlen (blob, blob_len); - if (len == blob_len) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - args->args.symlink.linkpath = GF_CALLOC (1, len + 1, - gf_common_mt_char); - if (args->args.symlink.linkpath == NULL) - goto err; - - memcpy (args->args.symlink.linkpath, blob, (len + 1)); - blob += (len + 1); - blob_len -= (len + 1); - } else { - if (blob_len < sizeof (uint32_t)) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - args->args.mknod.mode = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - if (blob_len < sizeof (uint32_t)) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. 
Invalid length", - args->gfid); - goto err; - } - args->args.mknod.rdev = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); - - if (blob_len < sizeof (uint32_t)) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; - } - args->args.mknod.umask = ntoh32 (*(uint32_t *)blob); - blob += sizeof (uint32_t); - blob_len -= sizeof (uint32_t); + } else { + if (blob_len < sizeof(uint32_t)) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; } - - if (blob_len) { - gf_log (this->name, GF_LOG_ERROR, - "gfid: %s. Invalid length", - args->gfid); - goto err; + args->args.mknod.mode = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + if (blob_len < sizeof(uint32_t)) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.rdev = ntoh32(*(uint32_t *)blob); + blob += sizeof(uint32_t); + blob_len -= sizeof(uint32_t); + + if (blob_len < sizeof(uint32_t)) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length", + args->gfid); + goto err; } + args->args.mknod.umask = ntoh32(*(uint32_t *)blob); + blob_len -= sizeof(uint32_t); + } - return args; + if (blob_len) { + gf_log(this->name, GF_LOG_ERROR, "gfid: %s. 
Invalid length", + args->gfid); + goto err; + } + + return args; err: - if (args) - ga_newfile_args_free (args); + if (args) + ga_newfile_args_free(args); - return NULL; + return NULL; } ga_heal_args_t * -ga_heal_parse_args (xlator_t *this, data_t *data) +ga_heal_parse_args(xlator_t *this, data_t *data) { - ga_heal_args_t *args = NULL; - ga_private_t *priv = NULL; - void *blob = NULL; - int len = 0; - int blob_len = 0; + ga_heal_args_t *args = NULL; + ga_private_t *priv = NULL; + void *blob = NULL; + int len = 0; + int blob_len = 0; - blob = data->data; - blob_len = data->len; + blob = data->data; + blob_len = data->len; - priv = this->private; + priv = this->private; - /* bname should at least contain a character */ - if (blob_len < (sizeof (args->gfid) + 2)) - goto err; + /* bname should at least contain a character */ + if (blob_len < (sizeof(args->gfid) + 2)) + goto err; - args = mem_get0 (priv->heal_args_pool); - if (!args) - goto err; + args = mem_get0(priv->heal_args_pool); + if (!args) + goto err; - memcpy (args->gfid, blob, sizeof (args->gfid)); - blob += sizeof (args->gfid); - blob_len -= sizeof (args->gfid); + memcpy(args->gfid, blob, sizeof(args->gfid)); + blob += sizeof(args->gfid); + blob_len -= sizeof(args->gfid); - len = strnlen (blob, blob_len); - if (len == blob_len) - goto err; + len = strnlen(blob, blob_len); + if (len == blob_len) + goto err; - args->bname = GF_CALLOC (1, len + 1, gf_common_mt_char); - if (!args->bname) - goto err; + args->bname = GF_MALLOC(len + 1, gf_common_mt_char); + if (!args->bname) + goto err; - memcpy (args->bname, blob, len); - blob_len -= (len + 1); + memcpy(args->bname, blob, len); + args->bname[len] = '\0'; + blob_len -= (len + 1); - if (blob_len) - goto err; + if (blob_len) + goto err; - return args; + return args; err: - if (args) - ga_heal_args_free (args); + if (args) + ga_heal_args_free(args); - return NULL; + return NULL; } static int32_t -ga_fill_tmp_loc (loc_t *loc, xlator_t *this, char *gfid, - char *bname, 
dict_t *xdata, loc_t *new_loc) +ga_fill_tmp_loc(loc_t *loc, xlator_t *this, uuid_t gfid, char *bname, + dict_t *xdata, loc_t *new_loc) { - int ret = -1; - uint64_t value = 0; - inode_t *parent = NULL; - - parent = loc->inode; - ret = inode_ctx_get (loc->inode, this, &value); - if (!ret) { - parent = (void *)value; - } - - /* parent itself should be looked up */ - uuid_copy (new_loc->pargfid, parent->gfid); - new_loc->parent = inode_ref (parent); - - new_loc->inode = inode_grep (parent->table, parent, bname); - if (!new_loc->inode) - new_loc->inode = inode_new (parent->table); - - loc_path (new_loc, bname); - new_loc->name = basename (new_loc->path); - - /* As GFID would not be set on the entry yet, lets not send entry - gfid in the request */ - /*uuid_copy (new_loc->gfid, (const unsigned char *)gfid); */ - - ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); - if (ret < 0) - goto out; - - ret = 0; + int ret = -1; + uint64_t value = 0; + inode_t *parent = NULL; + unsigned char *gfid_ptr = NULL; + + parent = loc->inode; + ret = inode_ctx_get(loc->inode, this, &value); + if (!ret) { + parent = (void *)(uintptr_t)value; + if (gf_uuid_is_null(parent->gfid)) + parent = loc->inode; + } + + /* parent itself should be looked up */ + gf_uuid_copy(new_loc->pargfid, parent->gfid); + new_loc->parent = inode_ref(parent); + + new_loc->inode = inode_grep(parent->table, parent, bname); + if (!new_loc->inode) { + new_loc->inode = inode_new(parent->table); + gf_uuid_copy(new_loc->inode->gfid, gfid); + } + + loc_path(new_loc, bname); + if (new_loc->path) { + new_loc->name = strrchr(new_loc->path, '/'); + if (new_loc->name) + new_loc->name++; + } + + gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid_ptr) { + ret = -1; + goto out; + } + gf_uuid_copy(gfid_ptr, gfid); + ret = dict_set_gfuuid(xdata, "gfid-req", gfid_ptr, false); + if (ret < 0) + goto out; + + ret = 0; out: - return ret; + if (ret && gfid_ptr) + GF_FREE(gfid_ptr); + return ret; } - - static 
gf_boolean_t -__is_gfid_access_dir (uuid_t gfid) +__is_gfid_access_dir(uuid_t gfid) { - uuid_t aux_gfid; - - memset (aux_gfid, 0, 16); - aux_gfid[15] = GF_AUX_GFID; + static uuid_t aux_gfid = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, GF_AUX_GFID}; - if (uuid_compare (gfid, aux_gfid) == 0) - return _gf_true; + if (gf_uuid_compare(gfid, aux_gfid) == 0) + return _gf_true; - return _gf_false; + return _gf_false; } int32_t -ga_forget (xlator_t *this, inode_t *inode) +ga_forget(xlator_t *this, inode_t *inode) { - int ret = -1; - uint64_t value = 0; - inode_t *tmp_inode = NULL; + int ret = -1; + uint64_t value = 0; + inode_t *tmp_inode = NULL; - ret = inode_ctx_del (inode, this, &value); - if (ret) - goto out; + ret = inode_ctx_del(inode, this, &value); + if (ret) + goto out; - tmp_inode = (void *)value; - inode_unref (tmp_inode); + tmp_inode = (void *)(uintptr_t)value; + inode_unref(tmp_inode); out: - return 0; + return 0; } - static int -ga_heal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stat, dict_t *dict, - struct iatt *postparent) +ga_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *stat, dict_t *dict, + struct iatt *postparent) { - call_frame_t *orig_frame = NULL; + call_frame_t *orig_frame = NULL; - orig_frame = frame->local; - frame->local = NULL; + orig_frame = frame->local; + frame->local = NULL; - /* don't worry about inode linking and other stuff. They'll happen on - * the next lookup. - */ - STACK_DESTROY (frame->root); + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. 
+ */ + STACK_DESTROY(frame->root); - STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT(setxattr, orig_frame, op_ret, op_errno, dict); - return 0; + return 0; } static int -ga_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +ga_newentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - call_frame_t *orig_frame = NULL; + ga_local_t *local = NULL; - orig_frame = frame->local; - frame->local = NULL; + local = frame->local; - /* don't worry about inode linking and other stuff. They'll happen on - * the next lookup. - */ - STACK_DESTROY (frame->root); + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. + */ + frame->local = NULL; + STACK_DESTROY(frame->root); - STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, xdata); + STACK_UNWIND_STRICT(setxattr, local->orig_frame, op_ret, op_errno, xdata); - return 0; + if (local->xdata) + dict_unref(local->xdata); + loc_wipe(&local->loc); + mem_put(local); + + return 0; } -int32_t -ga_new_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, - dict_t *xdata) -{ - int ret = -1; - ga_newfile_args_t *args = NULL; - loc_t tmp_loc = {0,}; - call_frame_t *new_frame = NULL; - mode_t mode = 0; +static int +ga_newentry_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stat, dict_t *xdata, + struct iatt *postparent) - args = ga_newfile_parse_args (this, data); - if (!args) - goto out; +{ + ga_local_t *local = NULL; - if (!xdata) - xdata = dict_new (); + local = frame->local; - ret = ga_fill_tmp_loc (loc, this, args->gfid, - args->bname, xdata, &tmp_loc); - if 
(ret) - goto out; - - new_frame = copy_frame (frame); - if (!new_frame) - goto out; - new_frame->local = (void *)frame; - - new_frame->root->uid = args->uid; - new_frame->root->gid = args->gid; - - if (S_ISDIR (args->st_mode)) { - STACK_WIND (new_frame, ga_newentry_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, - &tmp_loc, args->args.mkdir.mode, - args->args.mkdir.umask, xdata); - } else if (S_ISLNK (args->st_mode)) { - STACK_WIND (new_frame, ga_newentry_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - args->args.symlink.linkpath, - &tmp_loc, 0, xdata); - } else { - /* use 07777 (4 7s) for considering the Sticky bits etc) */ - mode = (S_IFMT & args->st_mode) | - (07777 | args->args.mknod.mode);; - - STACK_WIND (new_frame, ga_newentry_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - &tmp_loc, mode, - args->args.mknod.rdev, args->args.mknod.umask, - xdata); - } + if ((op_ret < 0) && ((op_errno != ENOENT) && (op_errno != ESTALE))) + goto err; - ret = 0; -out: - ga_newfile_args_free (args); + STACK_WIND(frame, ga_newentry_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, &local->loc, local->mode, + local->rdev, local->umask, local->xdata); + return 0; - return ret; +err: + frame->local = NULL; + STACK_DESTROY(frame->root); + STACK_UNWIND_STRICT(setxattr, local->orig_frame, op_ret, op_errno, xdata); + if (local->xdata) + dict_unref(local->xdata); + loc_wipe(&local->loc); + mem_put(local); + + return 0; } int32_t -ga_heal_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, - dict_t *xdata) +ga_new_entry(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) { - int ret = -1; - ga_heal_args_t *args = NULL; - loc_t tmp_loc = {0,}; - call_frame_t *new_frame = NULL; + int ret = -1; + ga_newfile_args_t *args = NULL; + loc_t tmp_loc = { + 0, + }; + call_frame_t *new_frame = NULL; + ga_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + if (!xdata) { + xdata = dict_new(); + } else { + xdata 
= dict_ref(xdata); + } + + if (!xdata) { + ret = -1; + goto out; + } + + args = ga_newfile_parse_args(this, data); + if (!args) + goto out; + + ret = gf_uuid_parse(args->gfid, gfid); + if (ret) + goto out; + + ret = ga_fill_tmp_loc(loc, this, gfid, args->bname, xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame(frame); + if (!new_frame) + goto out; + + local = mem_get0(this->local_pool); + local->orig_frame = frame; + + loc_copy(&local->loc, &tmp_loc); + + new_frame->local = local; + new_frame->root->uid = args->uid; + new_frame->root->gid = args->gid; + + if (S_ISDIR(args->st_mode)) { + STACK_WIND(new_frame, ga_newentry_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, &tmp_loc, + args->args.mkdir.mode, args->args.mkdir.umask, xdata); + } else if (S_ISLNK(args->st_mode)) { + STACK_WIND(new_frame, ga_newentry_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + args->args.symlink.linkpath, &tmp_loc, 0, xdata); + } else { + /* use 07777 (4 7s) for considering the Sticky bits etc) */ + ((ga_local_t *)new_frame->local)->mode = (S_IFMT & args->st_mode) | + (07777 & + args->args.mknod.mode); + + ((ga_local_t *)new_frame->local)->umask = args->args.mknod.umask; + ((ga_local_t *)new_frame->local)->rdev = args->args.mknod.rdev; + ((ga_local_t *)new_frame->local)->xdata = dict_ref(xdata); + + /* send a named lookup, so that dht can cleanup up stale linkto + * files etc. 
+ */ + STACK_WIND(new_frame, ga_newentry_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, NULL); + } - args = ga_heal_parse_args (this, data); - if (!args) - goto out; + ret = 0; +out: + ga_newfile_args_free(args); - if (!xdata) - xdata = dict_new (); + if (xdata) + dict_unref(xdata); - ret = ga_fill_tmp_loc (loc, this, args->gfid, args->bname, - xdata, &tmp_loc); - if (ret) - goto out; + loc_wipe(&tmp_loc); - new_frame = copy_frame (frame); - if (!new_frame) - goto out; - new_frame->local = (void *)frame; - - STACK_WIND (new_frame, ga_heal_cbk, FIRST_CHILD (this), - FIRST_CHILD(this)->fops->lookup, - &tmp_loc, xdata); + return ret; +} - ret = 0; +int32_t +ga_heal_entry(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) +{ + int ret = -1; + ga_heal_args_t *args = NULL; + loc_t tmp_loc = { + 0, + }; + call_frame_t *new_frame = NULL; + uuid_t gfid = { + 0, + }; + + args = ga_heal_parse_args(this, data); + if (!args) + goto out; + + ret = gf_uuid_parse(args->gfid, gfid); + if (ret) + goto out; + + if (!xdata) + xdata = dict_new(); + else + xdata = dict_ref(xdata); + + if (!xdata) { + ret = -1; + goto out; + } + + ret = ga_fill_tmp_loc(loc, this, gfid, args->bname, xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame(frame); + if (!new_frame) + goto out; + + new_frame->local = (void *)frame; + + STACK_WIND(new_frame, ga_heal_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + + ret = 0; out: - if (args) - ga_heal_args_free (args); + if (args) + ga_heal_args_free(args); - return ret; + loc_wipe(&tmp_loc); + + if (xdata) + dict_unref(xdata); + + return ret; } int32_t -ga_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xdata) +ga_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); - 
return 0; + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata); + return 0; } int32_t -ga_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) +ga_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { + data_t *data = NULL; + int op_errno = ENOMEM; + int ret = 0; + loc_t ga_loc = { + 0, + }; + + GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err); + + data = dict_get(dict, GF_FUSE_AUX_GFID_NEWFILE); + if (data) { + ret = ga_new_entry(frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } - data_t *data = NULL; - int op_errno = ENOMEM; - int ret = 0; - inode_t *unref = NULL; - - if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && - ((loc->parent && - __is_root_gfid (loc->parent->gfid)) || - __is_root_gfid (loc->pargfid))) { - op_errno = EPERM; - goto err; - } - - data = dict_get (dict, GF_FUSE_AUX_GFID_NEWFILE); - if (data) { - ret = ga_new_entry (frame, this, loc, data, xdata); - if (ret) - goto err; - return 0; - } - - data = dict_get (dict, GF_FUSE_AUX_GFID_HEAL); - if (data) { - ret = ga_heal_entry (frame, this, loc, data, xdata); - if (ret) - goto err; - return 0; - } + data = dict_get(dict, GF_FUSE_AUX_GFID_HEAL); + if (data) { + ret = ga_heal_entry(frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } - //If the inode is a virtual inode change the inode otherwise perform - //the operation on same inode - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + // If the inode is a virtual inode change the inode otherwise perform + // the operation on same inode + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; -wind: - STACK_WIND (frame, ga_setxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, - xdata); - if (unref) - inode_unref (unref); + STACK_WIND(frame, ga_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, &ga_loc, dict, 
flags, xdata); - return 0; + loc_wipe(&ga_loc); + return 0; err: - STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); - return 0; + STACK_UNWIND_STRICT(setxattr, frame, -1, op_errno, xdata); + return 0; } - int32_t -ga_virtual_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) +ga_virtual_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - int j = 0; - int i = 0; - int ret = 0; - uint64_t temp_ino = 0; - inode_t *cbk_inode = NULL; - inode_t *true_inode = NULL; - uuid_t random_gfid = {0,}; - - if (frame->local) - cbk_inode = frame->local; - else - cbk_inode = inode; - - frame->local = NULL; - if (op_ret) - goto unwind; - - if (!IA_ISDIR (buf->ia_type)) + int ret = 0; + inode_t *cbk_inode = NULL; + inode_t *true_inode = NULL; + uuid_t random_gfid = { + 0, + }; + inode_t *linked_inode = NULL; + + if (frame->local) + cbk_inode = frame->local; + else + cbk_inode = inode_ref(inode); + + frame->local = NULL; + if (op_ret) + goto unwind; + + if (!IA_ISDIR(buf->ia_type)) + goto unwind; + + /* need to send back a different inode for linking in itable */ + if (cbk_inode == inode) { + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find(inode->table, buf->ia_gfid); + if (!true_inode) { + /* This unref is for 'inode_ref()' done in beginning. + This is needed as cbk_inode is allocated new inode + whose unref is taken at the end*/ + inode_unref(cbk_inode); + cbk_inode = inode_new(inode->table); + + if (!cbk_inode) { + op_ret = -1; + op_errno = ENOMEM; goto unwind; + } + /* the inode is not present in itable, ie, the actual + path is not yet looked up. 
Use the current inode + itself for now */ - /* need to send back a different inode for linking in itable */ - if (cbk_inode == inode) { - /* check if the inode is in the 'itable' or - if its just previously discover()'d inode */ - true_inode = inode_find (inode->table, buf->ia_gfid); - if (!true_inode) { - cbk_inode = inode_new (inode->table); - - if (!cbk_inode) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - /* the inode is not present in itable, ie, the actual - path is not yet looked up. Use the current inode - itself for now */ - inode_ref (inode); - } else { - /* 'inode_ref()' has been done in inode_find() */ - inode = true_inode; - } - - ret = inode_ctx_put (cbk_inode, this, (uint64_t)inode); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set the inode ctx with" - "the actual inode"); - if (inode) - inode_unref (inode); - } - inode = NULL; + linked_inode = inode_link(inode, NULL, NULL, buf); + inode = linked_inode; + } else { + /* 'inode_ref()' has been done in inode_find() */ + inode = true_inode; } - if (!uuid_is_null (cbk_inode->gfid)) { - /* if the previous linked inode is used, use the - same gfid */ - uuid_copy (random_gfid, cbk_inode->gfid); - } else { - /* replace the buf->ia_gfid to a random gfid - for directory, for files, what we received is fine */ - uuid_generate (random_gfid); + ret = inode_ctx_put(cbk_inode, this, (uint64_t)(uintptr_t)inode); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "failed to set the inode ctx with" + "the actual inode"); + if (inode) + inode_unref(inode); } + inode = NULL; + } - uuid_copy (buf->ia_gfid, random_gfid); + if (!gf_uuid_is_null(cbk_inode->gfid)) { + /* if the previous linked inode is used, use the + same gfid */ + gf_uuid_copy(random_gfid, cbk_inode->gfid); + } else { + /* replace the buf->ia_gfid to a random gfid + for directory, for files, what we received is fine */ + gf_uuid_generate(random_gfid); + } - for (i = 15; i > (15 - 8); i--) { - temp_ino += 
(uint64_t)(buf->ia_gfid[i]) << j; - j += 8; - } - buf->ia_ino = temp_ino; + gf_uuid_copy(buf->ia_gfid, random_gfid); + + buf->ia_ino = gfid_to_ino(buf->ia_gfid); unwind: - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, cbk_inode, buf, - xdata, postparent); + /* Lookup on non-existing gfid returns ESTALE. + Convert into ENOENT for virtual lookup*/ + if (op_errno == ESTALE) + op_errno = ENOENT; - return 0; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, cbk_inode, buf, xdata, + postparent); + + /* Also handles inode_unref of frame->local if done in ga_lookup */ + if (cbk_inode) + inode_unref(cbk_inode); + + return 0; } int32_t -ga_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) +ga_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - ga_private_t *priv = NULL; + ga_private_t *priv = NULL; - /* if the entry in question is not 'root', - then follow the normal path */ - if (op_ret || !__is_root_gfid(buf->ia_gfid)) - goto unwind; + /* if the entry in question is not 'root', + then follow the normal path */ + if (op_ret || !__is_root_gfid(buf->ia_gfid)) + goto unwind; - priv = this->private; + priv = this->private; - /* do we need to copy root stbuf everytime? */ - /* mostly yes, as we want to have the 'stat' info show latest - in every _cbk() */ + /* do we need to copy root stbuf every time? 
*/ + /* mostly yes, as we want to have the 'stat' info show latest + in every _cbk() */ - /* keep the reference for root stat buf */ - priv->root_stbuf = *buf; - priv->gfiddir_stbuf = priv->root_stbuf; - priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID; - priv->gfiddir_stbuf.ia_ino = GF_AUX_GFID; + /* keep the reference for root stat buf */ + priv->root_stbuf = *buf; + priv->gfiddir_stbuf = priv->root_stbuf; + priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID; + priv->gfiddir_stbuf.ia_ino = GF_AUX_GFID; unwind: - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, - xdata, postparent); - return 0; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; } int32_t -ga_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +ga_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - ga_private_t *priv = NULL; - int ret = -1; - uuid_t tmp_gfid = {0,}; - loc_t tmp_loc = {0,}; - uint64_t value = 0; - inode_t *inode = NULL; - inode_t *true_inode = NULL; - int32_t op_errno = ENOENT; - - /* if its discover(), no need for any action here */ - if (!loc->name) - goto wind; - - /* if its revalidate, and inode is not of type directory, - proceed with 'wind' */ - if (loc->inode && loc->inode->ia_type && - !IA_ISDIR (loc->inode->ia_type)) - goto wind; - - priv = this->private; - - /* need to check if the lookup is on virtual dir */ - if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && - ((loc->parent && __is_root_gfid (loc->parent->gfid)) || - __is_root_gfid (loc->pargfid))) { - /* this means, the query is on '/.gfid', return the fake stat, - and say success */ - - STACK_UNWIND_STRICT (lookup, frame, 0, 0, loc->inode, - &priv->gfiddir_stbuf, xdata, - &priv->root_stbuf); - return 0; + ga_private_t *priv = NULL; + int ret = -1; + uuid_t tmp_gfid = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + uint64_t value = 0; + inode_t *inode = NULL; + inode_t *true_inode = NULL; + int32_t op_errno = ENOENT; + 
+ priv = this->private; + + /* Handle nameless lookup on ".gfid" */ + if (!loc->parent && __is_gfid_access_dir(loc->gfid)) { + STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc->inode, + &priv->gfiddir_stbuf, xdata, &priv->root_stbuf); + return 0; + } + + /* if its discover(), no need for any action here */ + if (!loc->name) + goto wind; + + /* if its revalidate, and inode is not of type directory, + proceed with 'wind' */ + if (loc->inode && loc->inode->ia_type && !IA_ISDIR(loc->inode->ia_type)) { + /* a revalidate on ".gfid/<dentry>" is possible, check for it */ + if (((loc->parent && __is_gfid_access_dir(loc->parent->gfid)) || + __is_gfid_access_dir(loc->pargfid))) { + /* here, just send 'loc->gfid' and 'loc->inode' */ + tmp_loc.inode = inode_ref(loc->inode); + gf_uuid_copy(tmp_loc.gfid, loc->inode->gfid); + + STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + + inode_unref(tmp_loc.inode); + + return 0; } - /* now, check if the lookup() is on an existing entry, - but on gfid-path */ - if (!((loc->parent && __is_gfid_access_dir (loc->parent->gfid)) || - __is_gfid_access_dir (loc->pargfid))) - goto wind; + /* not something to bother, continue the flow */ + goto wind; + } - /* make sure the 'basename' is actually a 'canonical-gfid', - otherwise, return error */ - ret = uuid_parse (loc->name, tmp_gfid); - if (ret) - goto err; + /* need to check if the lookup is on virtual dir */ + if ((loc->name && !strcmp(GF_GFID_DIR, loc->name)) && + ((loc->parent && __is_root_gfid(loc->parent->gfid)) || + __is_root_gfid(loc->pargfid))) { + /* this means, the query is on '/.gfid', return the fake stat, + and say success */ + + STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc->inode, + &priv->gfiddir_stbuf, xdata, &priv->root_stbuf); + return 0; + } - /* if its fresh lookup, go ahead and send it down, if not, - for directory, we need indirection to actual dir inode */ - if (!(loc->inode && loc->inode->ia_type)) - goto discover; + 
/* now, check if the lookup() is on an existing entry, + but on gfid-path */ + if (!((loc->parent && __is_gfid_access_dir(loc->parent->gfid)) || + __is_gfid_access_dir(loc->pargfid))) { + if (!loc->parent) + goto wind; - /* revalidate on directory */ - ret = inode_ctx_get (loc->inode, this, &value); + ret = inode_ctx_get(loc->parent, this, &value); if (ret) - goto err; + goto wind; - inode = (void *)value; + inode = (inode_t *)(uintptr_t)value; - /* valid inode, already looked up, work on that */ - if (inode->ia_type) - goto discover; + ret = loc_copy_overload_parent(&tmp_loc, loc, inode); + if (ret) + goto err; - /* check if the inode is in the 'itable' or - if its just previously discover()'d inode */ - true_inode = inode_find (loc->inode->table, tmp_gfid); - if (true_inode) { - /* time do another lookup and update the context - with proper inode */ - op_errno = ESTALE; - goto err; - } + STACK_WIND(frame, ga_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + + loc_wipe(&tmp_loc); + return 0; + } + + /* make sure the 'basename' is actually a 'canonical-gfid', + otherwise, return error */ + ret = gf_uuid_parse(loc->name, tmp_gfid); + if (ret) + goto err; + + /* if its fresh lookup, go ahead and send it down, if not, + for directory, we need indirection to actual dir inode */ + if (!(loc->inode && loc->inode->ia_type)) + goto discover; + + /* revalidate on directory */ + ret = inode_ctx_get(loc->inode, this, &value); + if (ret) + goto err; + + inode = (void *)(uintptr_t)value; + + /* valid inode, already looked up, work on that */ + if (inode->ia_type) + goto discover; + + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find(loc->inode->table, tmp_gfid); + if (true_inode) { + /* time do another lookup and update the context + with proper inode */ + op_errno = ESTALE; + /* 'inode_ref()' done in inode_find */ + inode_unref(true_inode); + goto err; + } discover: - /* for 
the virtual entries, we don't need to send 'gfid-req' key, as - for these entries, we don't want to 'set' a new gfid */ - if (xdata) - dict_del (xdata, "gfid-req"); + /* for the virtual entries, we don't need to send 'gfid-req' key, as + for these entries, we don't want to 'set' a new gfid */ + if (xdata) + dict_del(xdata, "gfid-req"); - uuid_copy (tmp_loc.gfid, tmp_gfid); + gf_uuid_copy(tmp_loc.gfid, tmp_gfid); - /* if revalidate, then we need to have the proper reference */ - if (inode) { - tmp_loc.inode = inode_ref (inode); - frame->local = loc->inode; - } else { - tmp_loc.inode = inode_ref (loc->inode); - } + /* if revalidate, then we need to have the proper reference */ + if (inode) { + tmp_loc.inode = inode_ref(inode); + frame->local = inode_ref(loc->inode); + } else { + tmp_loc.inode = inode_ref(loc->inode); + } - STACK_WIND (frame, ga_virtual_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + STACK_WIND(frame, ga_virtual_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); - inode_unref (tmp_loc.inode); + inode_unref(tmp_loc.inode); - return 0; + return 0; wind: - /* used for all the normal lookup path */ - STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xdata); + /* used for all the normal lookup path */ + STACK_WIND(frame, ga_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - return 0; + return 0; err: - STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, loc->inode, - &priv->gfiddir_stbuf, xdata, - &priv->root_stbuf); - return 0; + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, loc->inode, + &priv->gfiddir_stbuf, xdata, &priv->root_stbuf); + return 0; } int -ga_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +ga_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - int op_errno = 0; + int op_errno = ENOMEM; - 
GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, - xdata); + STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); - return 0; + return 0; err: - STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, loc->inode, - NULL, NULL, NULL, xdata); - return 0; + STACK_UNWIND_STRICT(mkdir, frame, -1, op_errno, loc->inode, NULL, NULL, + NULL, xdata); + return 0; } - int -ga_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +ga_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - int op_errno = 0; + int op_errno = ENOMEM; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - STACK_WIND (frame, default_create_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; + STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; err: - STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, - NULL, NULL, NULL, NULL, xdata); - - return 0; + STACK_UNWIND_STRICT(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, xdata); + return 0; } int -ga_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) +ga_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) { - int op_errno = 0; + int op_errno = ENOMEM; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - STACK_WIND (frame, default_symlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkname, loc, umask, xdata); - 
return 0; + STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; err: - STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL, - NULL, NULL, NULL, xdata); + STACK_UNWIND_STRICT(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + xdata); - return 0; + return 0; } int -ga_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +ga_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - int op_errno = 0; + int op_errno = ENOMEM; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, - umask, xdata); + STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); - return 0; + return 0; err: - STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, - NULL, NULL, NULL, xdata); + STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + xdata); - return 0; + return 0; } int -ga_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, - dict_t *xdata) +ga_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) { - int op_errno = 0; - inode_t *unref = NULL; + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; -wind: - STACK_WIND (frame, default_rmdir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, - loc, flag, xdata); - if (unref) - inode_unref (unref); + STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, 
&ga_loc, flag, xdata); - return 0; + loc_wipe(&ga_loc); + return 0; err: - STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, - NULL, xdata); + STACK_UNWIND_STRICT(rmdir, frame, -1, op_errno, NULL, NULL, xdata); - return 0; + return 0; } int -ga_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, - dict_t *xdata) +ga_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) { - int op_errno = 0; - inode_t *unref = NULL; + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(loc, op_errno, err); - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); - -wind: - STACK_WIND (frame, default_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc, xflag, xdata); + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; - if (unref) - inode_unref (unref); + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &ga_loc, xflag, xdata); - return 0; + loc_wipe(&ga_loc); + return 0; err: - STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, - NULL, xdata); + STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, xdata); - return 0; + return 0; } int -ga_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +ga_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - int op_errno = 0; - inode_t *oldloc_unref = NULL; - inode_t *newloc_unref = NULL; - - GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); - GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); - - GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, - handle_newloc); - -handle_newloc: - GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); - -wind: - STACK_WIND (frame, default_rename_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, - oldloc, newloc, xdata); - - if 
(oldloc_unref) - inode_unref (oldloc_unref); - - if (newloc_unref) - inode_unref (newloc_unref); - - return 0; + int op_errno = ENOMEM; + int ret = 0; + loc_t ga_oldloc = { + 0, + }; + loc_t ga_newloc = { + 0, + }; + + GFID_ACCESS_ENTRY_OP_CHECK(oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(newloc, op_errno, err); + + ret = ga_valid_inode_loc_copy(&ga_oldloc, oldloc, this); + if (ret < 0) + goto err; + + ret = ga_valid_inode_loc_copy(&ga_newloc, newloc, this); + if (ret < 0) { + loc_wipe(&ga_oldloc); + goto err; + } + + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &ga_oldloc, &ga_newloc, xdata); + + loc_wipe(&ga_newloc); + loc_wipe(&ga_oldloc); + return 0; err: - STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, - NULL, NULL, NULL, NULL, xdata); + STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, xdata); - return 0; + return 0; } - int -ga_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +ga_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - int op_errno = 0; - inode_t *oldloc_unref = NULL; - inode_t *newloc_unref = NULL; - - GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); - GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); - - GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, - handle_newloc); + int op_errno = ENOMEM; + int ret = 0; + loc_t ga_oldloc = { + 0, + }; + loc_t ga_newloc = { + 0, + }; + + GFID_ACCESS_ENTRY_OP_CHECK(oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK(newloc, op_errno, err); + + ret = ga_valid_inode_loc_copy(&ga_oldloc, oldloc, this); + if (ret < 0) + goto err; + + ret = ga_valid_inode_loc_copy(&ga_newloc, newloc, this); + if (ret < 0) { + loc_wipe(&ga_oldloc); + goto err; + } + + STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, &ga_oldloc, &ga_newloc, xdata); + + loc_wipe(&ga_newloc); + loc_wipe(&ga_oldloc); + 
return 0; -handle_newloc: - GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); - -wind: - STACK_WIND (frame, default_link_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, - oldloc, newloc, xdata); - - if (oldloc_unref) - inode_unref (oldloc_unref); - - if (newloc_unref) - inode_unref (newloc_unref); - - return 0; err: - STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, - NULL, NULL, NULL, xdata); + STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + xdata); - return 0; + return 0; } int32_t -ga_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, - fd_t *fd, dict_t *xdata) +ga_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - int op_errno = 0; + int op_errno = ENOMEM; - GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err); - /* also check if the loc->inode itself is virtual - inode, if yes, return with failure, mainly because we - can't handle all the readdirp and other things on it. */ - if (inode_ctx_get (loc->inode, this, NULL) == 0) { - op_errno = ENOTSUP; - goto err; - } + /* also check if the loc->inode itself is virtual + inode, if yes, return with failure, mainly because we + can't handle all the readdirp and other things on it. 
*/ + if (inode_ctx_get(loc->inode, this, NULL) == 0) { + op_errno = ENOTSUP; + goto err; + } - STACK_WIND (frame, default_opendir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, - loc, fd, xdata); - return 0; + STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; err: - STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL, xdata); + STACK_UNWIND_STRICT(opendir, frame, -1, op_errno, NULL, xdata); - return 0; + return 0; } int32_t -ga_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +ga_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - inode_t *unref = NULL; + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err); + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; -wind: - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, &ga_loc, name, xdata); - if (unref) - inode_unref (unref); + loc_wipe(&ga_loc); - return 0; + return 0; +err: + STACK_UNWIND_STRICT(getxattr, frame, -1, op_errno, NULL, xdata); + + return 0; } int32_t -ga_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) +ga_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - inode_t *unref = NULL; + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; + ga_private_t *priv = NULL; + + priv = this->private; + /* If stat is on ".gfid" itself, do not wind further, + * return fake stat and return success. 
+ */ + if (__is_gfid_access_dir(loc->gfid)) + goto out; + + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; + + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, &ga_loc, xdata); + + loc_wipe(&ga_loc); + return 0; - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); +err: + STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, xdata); -wind: - STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - if (unref) - inode_unref (unref); + return 0; - return 0; +out: + STACK_UNWIND_STRICT(stat, frame, 0, 0, &priv->gfiddir_stbuf, xdata); + return 0; } int32_t -ga_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, - dict_t *xdata) +ga_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - inode_t *unref = NULL; - - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); - -wind: - STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, - xdata); - if (unref) - inode_unref (unref); + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; + + GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err); + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; + + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, &ga_loc, stbuf, valid, xdata); + + loc_wipe(&ga_loc); + return 0; +err: + STACK_UNWIND_STRICT(setattr, frame, -1, op_errno, NULL, NULL, xdata); - return 0; + return 0; } int32_t -ga_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +ga_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - inode_t *unref = NULL; + int op_errno = ENOMEM; + int ret = -1; + loc_t ga_loc = { + 0, + }; - GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, 
unref, wind); + GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err); + ret = ga_valid_inode_loc_copy(&ga_loc, loc, this); + if (ret < 0) + goto err; -wind: - STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name, - xdata); - if (unref) - inode_unref (unref); + STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, &ga_loc, name, xdata); - return 0; -} + loc_wipe(&ga_loc); + return 0; +err: + STACK_UNWIND_STRICT(removexattr, frame, -1, op_errno, xdata); + + return 0; +} int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; - - if (!this) - return ret; + int ret = -1; - ret = xlator_mem_acct_init (this, gf_gfid_access_mt_end + 1); + if (!this) + return ret; - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, "Memory accounting" - " init failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_gfid_access_mt_end + 1); + if (ret != 0) { + gf_log(this->name, GF_LOG_WARNING, + "Memory accounting" + " init failed"); return ret; + } + + return ret; } int32_t -init (xlator_t *this) +init(xlator_t *this) { - ga_private_t *priv = NULL; - int ret = -1; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "not configured with exactly one child. exiting"); - goto out; - } - - /* This can be the top of graph in certain cases */ - if (!this->parents) { - gf_log (this->name, GF_LOG_DEBUG, - "dangling volume. 
check volfile "); - } - - /* TODO: define a mem-type structure */ - priv = GF_CALLOC (1, sizeof (*priv), gf_gfid_access_mt_priv_t); - if (!priv) - goto out; - - priv->newfile_args_pool = mem_pool_new (ga_newfile_args_t, 512); - if (!priv->newfile_args_pool) - goto out; - - priv->heal_args_pool = mem_pool_new (ga_heal_args_t, 512); - if (!priv->heal_args_pool) - goto out; - - this->private = priv; - - ret = 0; + ga_private_t *priv = NULL; + int ret = -1; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + goto out; + } + + /* This can be the top of graph in certain cases */ + if (!this->parents) { + gf_log(this->name, GF_LOG_DEBUG, "dangling volume. check volfile "); + } + + /* TODO: define a mem-type structure */ + priv = GF_CALLOC(1, sizeof(*priv), gf_gfid_access_mt_priv_t); + if (!priv) + goto out; + + priv->newfile_args_pool = mem_pool_new(ga_newfile_args_t, 512); + if (!priv->newfile_args_pool) + goto out; + + priv->heal_args_pool = mem_pool_new(ga_heal_args_t, 512); + if (!priv->heal_args_pool) + goto out; + + this->local_pool = mem_pool_new(ga_local_t, 16); + if (!this->local_pool) { + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = priv; + + ret = 0; out: - if (ret && priv) { - if (priv->newfile_args_pool) - mem_pool_destroy (priv->newfile_args_pool); - GF_FREE (priv); - } + if (ret && priv) { + if (priv->newfile_args_pool) + mem_pool_destroy(priv->newfile_args_pool); + GF_FREE(priv); + } - return ret; + return ret; } void -fini (xlator_t *this) +fini(xlator_t *this) { - ga_private_t *priv = NULL; - priv = this->private; - this->private = NULL; - - if (priv) { - if (priv->newfile_args_pool) - mem_pool_destroy (priv->newfile_args_pool); - if (priv->heal_args_pool) - mem_pool_destroy (priv->heal_args_pool); - GF_FREE (priv); - } - - return; + ga_private_t *priv = NULL; + priv = this->private; + this->private 
= NULL; + + if (priv) { + if (priv->newfile_args_pool) + mem_pool_destroy(priv->newfile_args_pool); + if (priv->heal_args_pool) + mem_pool_destroy(priv->heal_args_pool); + GF_FREE(priv); + } + + return; } +int32_t +ga_dump_inodectx(xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t value = 0; + inode_t *tmp_inode = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + ret = inode_ctx_get(inode, this, &value); + if (ret == 0) { + tmp_inode = (void *)(uintptr_t)value; + gf_proc_dump_build_key(key_prefix, this->name, "inode"); + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("real-gfid", "%s", uuid_utoa(tmp_inode->gfid)); + } + + return 0; +} struct xlator_fops fops = { - .lookup = ga_lookup, - - /* entry fops */ - .mkdir = ga_mkdir, - .mknod = ga_mknod, - .create = ga_create, - .symlink = ga_symlink, - .link = ga_link, - .unlink = ga_unlink, - .rmdir = ga_rmdir, - .rename = ga_rename, - - /* handle any other directory operations here */ - .opendir = ga_opendir, - .stat = ga_stat, - .setattr = ga_setattr, - .getxattr = ga_getxattr, - .removexattr = ga_removexattr, - - /* special fop to handle more entry creations */ - .setxattr = ga_setxattr, + .lookup = ga_lookup, + + /* entry fops */ + .mkdir = ga_mkdir, + .mknod = ga_mknod, + .create = ga_create, + .symlink = ga_symlink, + .link = ga_link, + .unlink = ga_unlink, + .rmdir = ga_rmdir, + .rename = ga_rename, + + /* handle any other directory operations here */ + .opendir = ga_opendir, + .stat = ga_stat, + .setattr = ga_setattr, + .getxattr = ga_getxattr, + .removexattr = ga_removexattr, + + /* special fop to handle more entry creations */ + .setxattr = ga_setxattr, }; struct xlator_cbks cbks = { - .forget = ga_forget, + .forget = ga_forget, +}; + +struct xlator_dumpops dumpops = { + .inodectx = ga_dump_inodectx, }; struct volume_options options[] = { - /* This translator doesn't take any options, or provide any options */ - { .key = {NULL} }, + /* This translator doesn't take any 
options, or provide any options */ + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .mem_acct_init = mem_acct_init, + .op_version = {1}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "gfid-access", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h index e13c9b7240b..b1e255e56c0 100644 --- a/xlators/features/gfid-access/src/gfid-access.h +++ b/xlators/features/gfid-access/src/gfid-access.h @@ -10,119 +10,98 @@ #ifndef __GFID_ACCESS_H__ #define __GFID_ACCESS_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "gfid-access-mem-types.h" #define UUID_CANONICAL_FORM_LEN 36 #define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile" -#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal" +#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal" #define GF_GFID_KEY "GLUSTERFS_GFID" #define GF_GFID_DIR ".gfid" #define GF_AUX_GFID 0xd -#define GFID_ACCESS_GET_VALID_DIR_INODE(x,l,unref,lbl) do { \ - int ret = 0; \ - uint64_t value = 0; \ - inode_t *tmp_inode = NULL; \ - \ - /* if its an entry operation, on the virtual */ \ - /* directory inode as parent, we need to handle */ \ - /* it properly */ \ - if (l->parent) { \ - ret = inode_ctx_get (l->parent, x, &value); \ - if (ret) \ - goto lbl; \ - tmp_inode = (inode_t *)value; \ - unref = inode_ref (tmp_inode); \ - l->parent = tmp_inode; \ - /* if parent is virtual, no need to handle */ \ - /* loc->inode */ \ - break; \ - } \ - \ - /* if its an inode operation, on the virtual */ \ - /* directory inode itself, we need to handle */ \ - /* it properly */ \ - if (l->inode) { \ - ret = 
inode_ctx_get (l->inode, x, &value); \ - if (ret) \ - goto lbl; \ - tmp_inode = (inode_t *)value; \ - unref = inode_ref (tmp_inode); \ - l->inode = tmp_inode; \ - } \ - \ - } while (0) - -#define GFID_ACCESS_ENTRY_OP_CHECK(loc,err,lbl) do { \ - /* need to check if the lookup is on virtual dir */ \ - if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && \ - ((loc->parent && \ - __is_root_gfid (loc->parent->gfid)) || \ - __is_root_gfid (loc->pargfid))) { \ - err = EEXIST; \ - goto lbl; \ - } \ - \ - /* now, check if the lookup() is on an existing */ \ - /* entry, but on gfid-path */ \ - if ((loc->parent && \ - __is_gfid_access_dir (loc->parent->gfid)) || \ - __is_gfid_access_dir (loc->pargfid)) { \ - err = EPERM; \ - goto lbl; \ - } \ - } while (0) - - +#define GFID_ACCESS_ENTRY_OP_CHECK(loc, err, lbl) \ + do { \ + /* need to check if the lookup is on virtual dir */ \ + if ((loc->name && !strcmp(GF_GFID_DIR, loc->name)) && \ + ((loc->parent && __is_root_gfid(loc->parent->gfid)) || \ + __is_root_gfid(loc->pargfid))) { \ + err = ENOTSUP; \ + goto lbl; \ + } \ + \ + /* now, check if the lookup() is on an existing */ \ + /* entry, but on gfid-path */ \ + if ((loc->parent && __is_gfid_access_dir(loc->parent->gfid)) || \ + __is_gfid_access_dir(loc->pargfid)) { \ + err = EPERM; \ + goto lbl; \ + } \ + } while (0) + +#define GFID_ACCESS_INODE_OP_CHECK(loc, err, lbl) \ + do { \ + /*Check if it is on .gfid*/ \ + if (__is_gfid_access_dir(loc->gfid)) { \ + err = ENOTSUP; \ + goto lbl; \ + } \ + } while (0) typedef struct { - unsigned int uid; - unsigned int gid; - char gfid[UUID_CANONICAL_FORM_LEN + 1]; - unsigned int st_mode; - char *bname; - - union { - struct _symlink_in { - char *linkpath; - } __attribute__ ((__packed__)) symlink; - - struct _mknod_in { - unsigned int mode; - unsigned int rdev; - unsigned int umask; - } __attribute__ ((__packed__)) mknod; - - struct _mkdir_in { - unsigned int mode; - unsigned int umask; - } __attribute__ ((__packed__)) mkdir; - } 
__attribute__ ((__packed__)) args; + unsigned int uid; + unsigned int gid; + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + unsigned int st_mode; + char *bname; + + union { + struct _symlink_in { + char *linkpath; + } __attribute__((__packed__)) symlink; + + struct _mknod_in { + unsigned int mode; + unsigned int rdev; + unsigned int umask; + } __attribute__((__packed__)) mknod; + + struct _mkdir_in { + unsigned int mode; + unsigned int umask; + } __attribute__((__packed__)) mkdir; + } __attribute__((__packed__)) args; } __attribute__((__packed__)) ga_newfile_args_t; typedef struct { - char gfid[UUID_CANONICAL_FORM_LEN + 1]; - char *bname; /* a null terminated basename */ + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + char *bname; /* a null terminated basename */ } __attribute__((__packed__)) ga_heal_args_t; struct ga_private { - /* root inode's stbuf */ - struct iatt root_stbuf; - struct iatt gfiddir_stbuf; - struct mem_pool *newfile_args_pool; - struct mem_pool *heal_args_pool; + /* root inode's stbuf */ + struct iatt root_stbuf; + struct iatt gfiddir_stbuf; + struct mem_pool *newfile_args_pool; + struct mem_pool *heal_args_pool; }; typedef struct ga_private ga_private_t; +struct __ga_local { + call_frame_t *orig_frame; + unsigned int uid; + unsigned int gid; + loc_t loc; + mode_t mode; + dev_t rdev; + mode_t umask; + dict_t *xdata; +}; +typedef struct __ga_local ga_local_t; + #endif /* __GFID_ACCESS_H__ */ diff --git a/xlators/features/glupy/doc/README.md b/xlators/features/glupy/doc/README.md deleted file mode 100644 index 2d7b30ef694..00000000000 --- a/xlators/features/glupy/doc/README.md +++ /dev/null @@ -1,44 +0,0 @@ -This is just the very start for a GlusterFS[1] meta-translator that will -allow translator code to be written in Python. It's based on the standard -Python embedding (not extending) techniques, plus a dash of the ctypes module. 
-The interface is a pretty minimal adaptation of the dispatches and callbacks -from the C API[2] to Python, as follows: - -* Dispatch functions and callbacks must be defined on an "xlator" class - derived from gluster.Translator so that they'll be auto-registered with - the C translator during initialization. - -* For each dispatch or callback function you want to intercept, you define a - Python function using the xxx\_fop\_t or xxx\_cbk\_t decorator. - -* The arguments for each operation are different, so you'll need to refer to - the C API. GlusterFS-specific types are used (though only loc\_t is fully - defined so far) and type correctness is enforced by ctypes. - -* If you do intercept a dispatch function, it is your responsibility to call - xxx\_wind (like STACK\_WIND in the C API but operation-specific) to pass - the request to the next translator. If you do not intercept a function, it - will default the same way as for C (pass through to the same operation with - the same arguments on the first child translator). - -* If you intercept a callback function, it is your responsibility to call - xxx\_unwind (like STACK\_UNWIND\_STRICT in the C API) to pass the request back - to the caller. - -So far only the lookup and create operations are handled this way, to support -the "negative lookup" example. Now that the basic infrastructure is in place, -adding more functions should be very quick, though with that much boilerplate I -might pause to write a code generator. I also plan to add structure -definitions and interfaces for some of the utility functions in libglusterfs -(especially those having to do with inode and fd context) in the fairly near -future. Note that you can also use ctypes to get at anything not explicitly -exposed to Python already. - -_If you're coming here because of the Linux Journal article, please note that -the code has evolved since that was written. 
The version that matches the -article is here:_ - -https://github.com/jdarcy/glupy/tree/4bbae91ba459ea46ef32f2966562492e4ca9187a - -[1] http://www.gluster.org -[2] http://hekafs.org/dist/xlator_api_2.html diff --git a/xlators/features/glupy/doc/TESTING b/xlators/features/glupy/doc/TESTING deleted file mode 100644 index e05f17f498f..00000000000 --- a/xlators/features/glupy/doc/TESTING +++ /dev/null @@ -1,9 +0,0 @@ -Loading a translator written in Python using the glupy meta translator -------------------------------------------------------------------------------- -'test.vol' is a simple volfile with the debug-trace Python translator on top -of a brick. The volfile can be mounted using the following command. - -$ glusterfs --debug -f test.vol /path/to/mntpt - -If then file operations are performed on the newly mounted file system, log -output would be printed by the Python translator on the standard output. diff --git a/xlators/features/glupy/doc/test.vol b/xlators/features/glupy/doc/test.vol deleted file mode 100644 index 0751a488c1f..00000000000 --- a/xlators/features/glupy/doc/test.vol +++ /dev/null @@ -1,10 +0,0 @@ -volume vol-posix - type storage/posix - option directory /path/to/brick -end-volume - -volume vol-glupy - type features/glupy - option module-name debug-trace - subvolumes vol-posix -end-volume diff --git a/xlators/features/glupy/src/Makefile.am b/xlators/features/glupy/src/Makefile.am deleted file mode 100644 index 9608628398f..00000000000 --- a/xlators/features/glupy/src/Makefile.am +++ /dev/null @@ -1,20 +0,0 @@ -xlator_LTLIBRARIES = glupy.la - -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features - -glupydir = $(xlatordir)/glupy - -glupy_PYTHON = gluster.py negative.py helloworld.py debug-trace.py - -glupy_la_LDFLAGS = -module -avoid-version -shared -nostartfiles -glupy_la_SOURCES = glupy.c -glupy_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - -lpthread -l$(BUILD_PYTHON_LIB) - -noinst_HEADERS = glupy.h - 
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -isystem $(BUILD_PYTHON_INC) - -AM_CFLAGS = -Wall -fno-strict-aliasing -DGLUSTER_PYTHON_PATH=\"$(glupydir)\" $(GF_CFLAGS) - -CLEANFILES = diff --git a/xlators/features/glupy/src/debug-trace.py b/xlators/features/glupy/src/debug-trace.py deleted file mode 100644 index 53e76546b15..00000000000 --- a/xlators/features/glupy/src/debug-trace.py +++ /dev/null @@ -1,774 +0,0 @@ -import sys -import stat -from uuid import UUID -from time import strftime, localtime -from gluster import * -# This translator was written primarily to test the fop entry point definitions -# and structure definitions in 'gluster.py'. -# It is similar to the debug-trace translator, one of the already available -# translator types written in C, that logs the arguments passed to the fops and -# their corresponding cbk functions. - -dl.get_id.restype = c_long -dl.get_id.argtypes = [ POINTER(call_frame_t) ] - -dl.get_rootunique.restype = c_uint64 -dl.get_rootunique.argtypes = [ POINTER(call_frame_t) ] - -def uuid2str (gfid): - return str(UUID(''.join(map("{0:02x}".format, gfid)))) - - -def st_mode_from_ia (prot, filetype): - st_mode = 0 - type_bit = 0 - prot_bit = 0 - - if filetype == IA_IFREG: - type_bit = stat.S_IFREG - elif filetype == IA_IFDIR: - type_bit = stat.S_IFDIR - elif filetype == IA_IFLNK: - type_bit = stat.S_IFLNK - elif filetype == IA_IFBLK: - type_bit = stat.S_IFBLK - elif filetype == IA_IFCHR: - type_bit = stat.S_IFCHR - elif filetype == IA_IFIFO: - type_bit = stat.S_IFIFO - elif filetype == IA_IFSOCK: - type_bit = stat.S_IFSOCK - elif filetype == IA_INVAL: - pass - - - if prot.suid: - prot_bit |= stat.S_ISUID - if prot.sgid: - prot_bit |= stat.S_ISGID - if prot.sticky: - prot_bit |= stat.S_ISVTX - - if prot.owner.read: - prot_bit |= stat.S_IRUSR - if prot.owner.write: - prot_bit |= stat.S_IWUSR - if prot.owner.execn: - prot_bit |= stat.S_IXUSR - - if prot.group.read: - prot_bit |= stat.S_IRGRP - if prot.group.write: - 
prot_bit |= stat.S_IWGRP - if prot.group.execn: - prot_bit |= stat.S_IXGRP - - if prot.other.read: - prot_bit |= stat.S_IROTH - if prot.other.write: - prot_bit |= stat.S_IWOTH - if prot.other.execn: - prot_bit |= stat.S_IXOTH - - st_mode = (type_bit | prot_bit) - - return st_mode - - -def trace_stat2str (buf): - gfid = uuid2str(buf.contents.ia_gfid) - mode = st_mode_from_ia(buf.contents.ia_prot, buf.contents.ia_type) - atime_buf = strftime("[%b %d %H:%M:%S]", - localtime(buf.contents.ia_atime)) - mtime_buf = strftime("[%b %d %H:%M:%S]", - localtime(buf.contents.ia_mtime)) - ctime_buf = strftime("[%b %d %H:%M:%S]", - localtime(buf.contents.ia_ctime)) - return ("(gfid={0:s}, ino={1:d}, mode={2:o}, nlink={3:d}, uid ={4:d}, "+ - "gid ={5:d}, size={6:d}, blocks={7:d}, atime={8:s}, mtime={9:s}, "+ - "ctime={10:s})").format(gfid, buf.contents.ia_no, mode, - buf.contents.ia_nlink, - buf.contents.ia_uid, - buf.contents.ia_gid, - buf.contents.ia_size, - buf.contents.ia_blocks, - atime_buf, mtime_buf, - ctime_buf) - -class xlator(Translator): - - def __init__(self, c_this): - Translator.__init__(self, c_this) - self.gfids = {} - - def lookup_fop(self, frame, this, loc, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.gfid) - print("GLUPY TRACE LOOKUP FOP- {0:d}: gfid={1:s}; " + - "path={2:s}").format(unique, gfid, loc.contents.path) - self.gfids[key] = gfid - dl.wind_lookup(frame, POINTER(xlator_t)(), loc, xdata) - return 0 - - def lookup_cbk(self, frame, cookie, this, op_ret, op_errno, - inode, buf, xdata, postparent): - unique =dl.get_rootunique(frame) - key =dl.get_id(frame) - if op_ret == 0: - gfid = uuid2str(buf.contents.ia_gfid) - statstr = trace_stat2str(buf) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE LOOKUP CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; *buf={3:s}; " + - "*postparent={4:s}").format(unique, gfid, - op_ret, statstr, - postparentstr) - else: - gfid = self.gfids[key] - print("GLUPY 
TRACE LOOKUP CBK - {0:d}: gfid={1:s};" + - " op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_lookup(frame, cookie, this, op_ret, op_errno, - inode, buf, xdata, postparent) - return 0 - - def create_fop(self, frame, this, loc, flags, mode, umask, fd, - xdata): - unique = dl.get_rootunique(frame) - gfid = uuid2str(loc.contents.gfid) - print("GLUPY TRACE CREATE FOP- {0:d}: gfid={1:s}; path={2:s}; " + - "fd={3:s}; flags=0{4:o}; mode=0{5:o}; " + - "umask=0{6:o}").format(unique, gfid, loc.contents.path, - fd, flags, mode, umask) - dl.wind_create(frame, POINTER(xlator_t)(), loc, flags,mode, - umask, fd, xdata) - return 0 - - def create_cbk(self, frame, cookie, this, op_ret, op_errno, fd, - inode, buf, preparent, postparent, xdata): - unique = dl.get_rootunique(frame) - if op_ret >= 0: - gfid = uuid2str(inode.contents.gfid) - statstr = trace_stat2str(buf) - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE CREATE CBK- {0:d}: gfid={1:s};" + - " op_ret={2:d}; fd={3:s}; *stbuf={4:s}; " + - "*preparent={5:s};" + - " *postparent={6:s}").format(unique, gfid, op_ret, - fd, statstr, - preparentstr, - postparentstr) - else: - print ("GLUPY TRACE CREATE CBK- {0:d}: op_ret={1:d}; " + - "op_errno={2:d}").format(unique, op_ret, op_errno) - dl.unwind_create(frame, cookie, this, op_ret, op_errno, fd, - inode, buf, preparent, postparent, xdata) - return 0 - - def open_fop(self, frame, this, loc, flags, fd, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE OPEN FOP- {0:d}: gfid={1:s}; path={2:s}; "+ - "flags={3:d}; fd={4:s}").format(unique, gfid, - loc.contents.path, flags, - fd) - self.gfids[key] = gfid - dl.wind_open(frame, POINTER(xlator_t)(), loc, flags, fd, xdata) - return 0 - - def open_cbk(self, frame, cookie, this, op_ret, op_errno, fd, xdata): - unique = 
dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE OPEN CBK- {0:d}: gfid={1:s}; op_ret={2:d}; " - "op_errno={3:d}; *fd={4:s}").format(unique, gfid, - op_ret, op_errno, fd) - del self.gfids[key] - dl.unwind_open(frame, cookie, this, op_ret, op_errno, fd, - xdata) - return 0 - - def readv_fop(self, frame, this, fd, size, offset, flags, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE READV FOP- {0:d}: gfid={1:s}; "+ - "fd={2:s}; size ={3:d}; offset={4:d}; " + - "flags=0{5:x}").format(unique, gfid, fd, size, offset, - flags) - self.gfids[key] = gfid - dl.wind_readv (frame, POINTER(xlator_t)(), fd, size, offset, - flags, xdata) - return 0 - - def readv_cbk(self, frame, cookie, this, op_ret, op_errno, vector, - count, buf, iobref, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret >= 0: - statstr = trace_stat2str(buf) - print("GLUPY TRACE READV CBK- {0:d}: gfid={1:s}, "+ - "op_ret={2:d}; *buf={3:s};").format(unique, gfid, - op_ret, - statstr) - - else: - print("GLUPY TRACE READV CBK- {0:d}: gfid={1:s}, "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_readv (frame, cookie, this, op_ret, op_errno, - vector, count, buf, iobref, xdata) - return 0 - - def writev_fop(self, frame, this, fd, vector, count, offset, flags, - iobref, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE WRITEV FOP- {0:d}: gfid={1:s}; " + - "fd={2:s}; count={3:d}; offset={4:d}; " + - "flags=0{5:x}").format(unique, gfid, fd, count, offset, - flags) - self.gfids[key] = gfid - dl.wind_writev(frame, POINTER(xlator_t)(), fd, vector, count, - offset, flags, iobref, xdata) - return 0 - - def writev_cbk(self, frame, cookie, this, op_ret, op_errno, prebuf, - postbuf, 
xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - if op_ret >= 0: - preopstr = trace_stat2str(prebuf) - postopstr = trace_stat2str(postbuf) - print("GLUPY TRACE WRITEV CBK- {0:d}: op_ret={1:d}; " + - "*prebuf={2:s}; " + - "*postbuf={3:s}").format(unique, op_ret, preopstr, - postopstr) - else: - gfid = self.gfids[key] - print("GLUPY TRACE WRITEV CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_writev (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, xdata) - return 0 - - def opendir_fop(self, frame, this, loc, fd, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE OPENDIR FOP- {0:d}: gfid={1:s}; path={2:s}; "+ - "fd={3:s}").format(unique, gfid, loc.contents.path, fd) - self.gfids[key] = gfid - dl.wind_opendir(frame, POINTER(xlator_t)(), loc, fd, xdata) - return 0 - - def opendir_cbk(self, frame, cookie, this, op_ret, op_errno, fd, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE OPENDIR CBK- {0:d}: gfid={1:s}; op_ret={2:d};"+ - " op_errno={3:d}; fd={4:s}").format(unique, gfid, op_ret, - op_errno, fd) - del self.gfids[key] - dl.unwind_opendir(frame, cookie, this, op_ret, op_errno, - fd, xdata) - return 0 - - def readdir_fop(self, frame, this, fd, size, offset, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE READDIR FOP- {0:d}: gfid={1:s}; fd={2:s}; " + - "size={3:d}; offset={4:d}").format(unique, gfid, fd, size, - offset) - self.gfids[key] = gfid - dl.wind_readdir(frame, POINTER(xlator_t)(), fd, size, offset, - xdata) - return 0 - - def readdir_cbk(self, frame, cookie, this, op_ret, op_errno, buf, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - 
print("GLUPY TRACE READDIR CBK- {0:d}: gfid={1:s}; op_ret={2:d};"+ - " op_errno={3:d}").format(unique, gfid, op_ret, op_errno) - del self.gfids[key] - dl.unwind_readdir(frame, cookie, this, op_ret, op_errno, buf, - xdata) - return 0 - - def readdirp_fop(self, frame, this, fd, size, offset, dictionary): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE READDIRP FOP- {0:d}: gfid={1:s}; fd={2:s}; "+ - " size={3:d}; offset={4:d}").format(unique, gfid, fd, size, - offset) - self.gfids[key] = gfid - dl.wind_readdirp(frame, POINTER(xlator_t)(), fd, size, offset, - dictionary) - return 0 - - def readdirp_cbk(self, frame, cookie, this, op_ret, op_errno, buf, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE READDIRP CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, gfid, - op_ret, op_errno) - del self.gfids[key] - dl.unwind_readdirp(frame, cookie, this, op_ret, op_errno, buf, - xdata) - return 0 - - def mkdir_fop(self, frame, this, loc, mode, umask, xdata): - unique = dl.get_rootunique(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE MKDIR FOP- {0:d}: gfid={1:s}; path={2:s}; " + - "mode={3:d}; umask=0{4:o}").format(unique, gfid, - loc.contents.path, mode, - umask) - dl.wind_mkdir(frame, POINTER(xlator_t)(), loc, mode, umask, - xdata) - return 0 - - def mkdir_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf, - preparent, postparent, xdata): - unique = dl.get_rootunique(frame) - if op_ret == 0: - gfid = uuid2str(inode.contents.gfid) - statstr = trace_stat2str(buf) - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE MKDIR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; *stbuf={3:s}; *prebuf={4:s}; "+ - "*postbuf={5:s} ").format(unique, gfid, op_ret, - statstr, - preparentstr, - postparentstr) - else: - 
print("GLUPY TRACE MKDIR CBK- {0:d}: op_ret={1:d}; "+ - "op_errno={2:d}").format(unique, op_ret, op_errno) - dl.unwind_mkdir(frame, cookie, this, op_ret, op_errno, inode, - buf, preparent, postparent, xdata) - return 0 - - def rmdir_fop(self, frame, this, loc, flags, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE RMDIR FOP- {0:d}: gfid={1:s}; path={2:s}; "+ - "flags={3:d}").format(unique, gfid, loc.contents.path, - flags) - self.gfids[key] = gfid - dl.wind_rmdir(frame, POINTER(xlator_t)(), loc, flags, xdata) - return 0 - - def rmdir_cbk(self, frame, cookie, this, op_ret, op_errno, preparent, - postparent, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE RMDIR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; *prebuf={3:s}; "+ - "*postbuf={4:s}").format(unique, gfid, op_ret, - preparentstr, - postparentstr) - else: - print("GLUPY TRACE RMDIR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_rmdir(frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata) - return 0 - - def stat_fop(self, frame, this, loc, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE STAT FOP- {0:d}: gfid={1:s}; " + - " path={2:s}").format(unique, gfid, loc.contents.path) - self.gfids[key] = gfid - dl.wind_stat(frame, POINTER(xlator_t)(), loc, xdata) - return 0 - - def stat_cbk(self, frame, cookie, this, op_ret, op_errno, buf, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - statstr = trace_stat2str(buf) - print("GLUPY TRACE STAT CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; 
*buf={3:s};").format(unique, - gfid, - op_ret, - statstr) - else: - print("GLUPY TRACE STAT CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_stat(frame, cookie, this, op_ret, op_errno, - buf, xdata) - return 0 - - def fstat_fop(self, frame, this, fd, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE FSTAT FOP- {0:d}: gfid={1:s}; " + - "fd={2:s}").format(unique, gfid, fd) - self.gfids[key] = gfid - dl.wind_fstat(frame, POINTER(xlator_t)(), fd, xdata) - return 0 - - def fstat_cbk(self, frame, cookie, this, op_ret, op_errno, buf, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - statstr = trace_stat2str(buf) - print("GLUPY TRACE FSTAT CBK- {0:d}: gfid={1:s} "+ - " op_ret={2:d}; *buf={3:s}").format(unique, - gfid, - op_ret, - statstr) - else: - print("GLUPY TRACE FSTAT CBK- {0:d}: gfid={1:s} "+ - "op_ret={2:d}; op_errno={3:d}").format(unique. 
- gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_fstat(frame, cookie, this, op_ret, op_errno, - buf, xdata) - return 0 - - def statfs_fop(self, frame, this, loc, xdata): - unique = dl.get_rootunique(frame) - if loc.contents.inode: - gfid = uuid2str(loc.contents.inode.contents.gfid) - else: - gfid = "0" - print("GLUPY TRACE STATFS FOP- {0:d}: gfid={1:s}; "+ - "path={2:s}").format(unique, gfid, loc.contents.path) - dl.wind_statfs(frame, POINTER(xlator_t)(), loc, xdata) - return 0 - - def statfs_cbk(self, frame, cookie, this, op_ret, op_errno, buf, - xdata): - unique = dl.get_rootunique(frame) - if op_ret == 0: - #TBD: print buf (pointer to an iovec type object) - print("GLUPY TRACE STATFS CBK {0:d}: "+ - "op_ret={1:d}").format(unique, op_ret) - else: - print("GLUPY TRACE STATFS CBK- {0:d}"+ - "op_ret={1:d}; op_errno={2:d}").format(unique, - op_ret, - op_errno) - dl.unwind_statfs(frame, cookie, this, op_ret, op_errno, - buf, xdata) - return 0 - - def getxattr_fop(self, frame, this, loc, name, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE GETXATTR FOP- {0:d}: gfid={1:s}; path={2:s};"+ - " name={3:s}").format(unique, gfid, loc.contents.path, - name) - self.gfids[key]=gfid - dl.wind_getxattr(frame, POINTER(xlator_t)(), loc, name, xdata) - return 0 - - def getxattr_cbk(self, frame, cookie, this, op_ret, op_errno, - dictionary, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE GETXATTR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}; "+ - " dictionary={4:s}").format(unique, gfid, op_ret, op_errno, - dictionary) - del self.gfids[key] - dl.unwind_getxattr(frame, cookie, this, op_ret, op_errno, - dictionary, xdata) - return 0 - - def fgetxattr_fop(self, frame, this, fd, name, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = 
uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE FGETXATTR FOP- {0:d}: gfid={1:s}; fd={2:s}; "+ - "name={3:s}").format(unique, gfid, fd, name) - self.gfids[key] = gfid - dl.wind_fgetxattr(frame, POINTER(xlator_t)(), fd, name, xdata) - return 0 - - def fgetxattr_cbk(self, frame, cookie, this, op_ret, op_errno, - dictionary, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE FGETXATTR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d};"+ - " dictionary={4:s}").format(unique, gfid, op_ret, - op_errno, dictionary) - del self.gfids[key] - dl.unwind_fgetxattr(frame, cookie, this, op_ret, op_errno, - dictionary, xdata) - return 0 - - def setxattr_fop(self, frame, this, loc, dictionary, flags, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE SETXATTR FOP- {0:d}: gfid={1:s}; path={2:s};"+ - " flags={3:d}").format(unique, gfid, loc.contents.path, - flags) - self.gfids[key] = gfid - dl.wind_setxattr(frame, POINTER(xlator_t)(), loc, dictionary, - flags, xdata) - return 0 - - def setxattr_cbk(self, frame, cookie, this, op_ret, op_errno, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE SETXATTR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, gfid, - op_ret, op_errno) - del self.gfids[key] - dl.unwind_setxattr(frame, cookie, this, op_ret, op_errno, - xdata) - return 0 - - def fsetxattr_fop(self, frame, this, fd, dictionary, flags, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(fd.contents.inode.contents.gfid) - print("GLUPY TRACE FSETXATTR FOP- {0:d}: gfid={1:s}; fd={2:p}; "+ - "flags={3:d}").format(unique, gfid, fd, flags) - self.gfids[key] = gfid - dl.wind_fsetxattr(frame, POINTER(xlator_t)(), fd, dictionary, - flags, xdata) - return 0 - - def fsetxattr_cbk(self, frame, 
cookie, this, op_ret, op_errno, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE FSETXATTR CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, gfid, - op_ret, op_errno) - del self.gfids[key] - dl.unwind_fsetxattr(frame, cookie, this, op_ret, op_errno, - xdata) - return 0 - - def removexattr_fop(self, frame, this, loc, name, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE REMOVEXATTR FOP- {0:d}: gfid={1:s}; "+ - "path={2:s}; name={3:s}").format(unique, gfid, - loc.contents.path, - name) - self.gfids[key] = gfid - dl.wind_removexattr(frame, POINTER(xlator_t)(), loc, name, - xdata) - return 0 - - def removexattr_cbk(self, frame, cookie, this, op_ret, op_errno, - xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - print("GLUPY TRACE REMOVEXATTR CBK- {0:d}: gfid={1:s} "+ - " op_ret={2:d}; op_errno={3:d}").format(unique, gfid, - op_ret, op_errno) - del self.gfids[key] - dl.unwind_removexattr(frame, cookie, this, op_ret, op_errno, - xdata) - return 0 - - def link_fop(self, frame, this, oldloc, newloc, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - if (newloc.contents.inode): - newgfid = uuid2str(newloc.contents.inode.contents.gfid) - else: - newgfid = "0" - oldgfid = uuid2str(oldloc.contents.inode.contents.gfid) - print("GLUPY TRACE LINK FOP-{0:d}: oldgfid={1:s}; oldpath={2:s};"+ - "newgfid={3:s};"+ - "newpath={4:s}").format(unique, oldgfid, - oldloc.contents.path, - newgfid, - newloc.contents.path) - self.gfids[key] = oldgfid - dl.wind_link(frame, POINTER(xlator_t)(), oldloc, newloc, - xdata) - return 0 - - def link_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf, - preparent, postparent, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - statstr = 
trace_stat2str(buf) - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE LINK CBK- {0:d}: op_ret={1:d} "+ - "*stbuf={2:s}; *prebuf={3:s}; "+ - "*postbuf={4:s} ").format(unique, op_ret, statstr, - preparentstr, - postparentstr) - else: - print("GLUPY TRACE LINK CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; "+ - "op_errno={3:d}").format(unique, gfid, - op_ret, op_errno) - del self.gfids[key] - dl.unwind_link(frame, cookie, this, op_ret, op_errno, inode, - buf, preparent, postparent, xdata) - return 0 - - def unlink_fop(self, frame, this, loc, xflag, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE UNLINK FOP- {0:d}; gfid={1:s}; path={2:s}; "+ - "flag={3:d}").format(unique, gfid, loc.contents.path, - xflag) - self.gfids[key] = gfid - dl.wind_unlink(frame, POINTER(xlator_t)(), loc, xflag, - xdata) - return 0 - - def unlink_cbk(self, frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE UNLINK CBK- {0:d}: gfid ={1:s}; "+ - "op_ret={2:d}; *prebuf={3:s}; "+ - "*postbuf={4:s} ").format(unique, gfid, op_ret, - preparentstr, - postparentstr) - else: - print("GLUPY TRACE UNLINK CBK: {0:d}: gfid ={1:s}; "+ - "op_ret={2:d}; "+ - "op_errno={3:d}").format(unique, gfid, op_ret, - op_errno) - del self.gfids[key] - dl.unwind_unlink(frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata) - return 0 - - def readlink_fop(self, frame, this, loc, size, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE READLINK FOP- {0:d}: gfid={1:s}; path={2:s};"+ - " size={3:d}").format(unique, gfid, loc.contents.path, - size) 
- self.gfids[key] = gfid - dl.wind_readlink(frame, POINTER(xlator_t)(), loc, size, - xdata) - return 0 - - def readlink_cbk(self, frame, cookie, this, op_ret, op_errno, - buf, stbuf, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - statstr = trace_stat2str(stbuf) - print("GLUPY TRACE READLINK CBK- {0:d}: gfid={1:s} "+ - " op_ret={2:d}; op_errno={3:d}; *prebuf={4:s}; "+ - "*postbuf={5:s} ").format(unique, gfid, - op_ret, op_errno, - buf, statstr) - else: - print("GLUPY TRACE READLINK CBK- {0:d}: gfid={1:s} "+ - " op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_readlink(frame, cookie, this, op_ret, op_errno, buf, - stbuf, xdata) - return 0 - - def symlink_fop(self, frame, this, linkpath, loc, umask, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = uuid2str(loc.contents.inode.contents.gfid) - print("GLUPY TRACE SYMLINK FOP- {0:d}: gfid={1:s}; "+ - "linkpath={2:s}; path={3:s};"+ - "umask=0{4:o}").format(unique, gfid, linkpath, - loc.contents.path, umask) - self.gfids[key] = gfid - dl.wind_symlink(frame, POINTER(xlator_t)(), linkpath, loc, - umask, xdata) - return 0 - - def symlink_cbk(self, frame, cookie, this, op_ret, op_errno, - inode, buf, preparent, postparent, xdata): - unique = dl.get_rootunique(frame) - key = dl.get_id(frame) - gfid = self.gfids[key] - if op_ret == 0: - statstr = trace_stat2str(buf) - preparentstr = trace_stat2str(preparent) - postparentstr = trace_stat2str(postparent) - print("GLUPY TRACE SYMLINK CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; *stbuf={3:s}; *preparent={4:s}; "+ - "*postparent={5:s}").format(unique, gfid, - op_ret, statstr, - preparentstr, - postparentstr) - else: - print("GLUPY TRACE SYMLINK CBK- {0:d}: gfid={1:s}; "+ - "op_ret={2:d}; op_errno={3:d}").format(unique, - gfid, - op_ret, - op_errno) - del self.gfids[key] - dl.unwind_symlink(frame, cookie, this, op_ret, op_errno, - 
inode, buf, preparent, postparent, xdata) - return 0 diff --git a/xlators/features/glupy/src/glupy.c b/xlators/features/glupy/src/glupy.c deleted file mode 100644 index dc86c0071e1..00000000000 --- a/xlators/features/glupy/src/glupy.c +++ /dev/null @@ -1,2470 +0,0 @@ -/* - Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <ctype.h> -#include <sys/uio.h> -#include <Python.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" -#include "logging.h" -#include "defaults.h" - -#include "glupy.h" - -/* UTILITY FUNCTIONS FOR FOP-SPECIFIC CODE */ - -pthread_key_t gil_init_key; - -PyGILState_STATE -glupy_enter (void) -{ -#if 0 - if (!pthread_getspecific(gil_init_key)) { - PyEval_ReleaseLock(); - (void)pthread_setspecific(gil_init_key,(void *)1); - } -#endif - - return PyGILState_Ensure(); -} - -void -glupy_leave (PyGILState_STATE gstate) -{ - PyGILState_Release(gstate); -} - -/* FOP: LOOKUP */ - -int32_t -glupy_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_LOOKUP]) { - goto unwind; - } - - gstate = 
glupy_enter(); - ret = ((fop_lookup_cbk_t)(priv->cbks[GLUPY_LOOKUP]))( - frame, cookie, this, op_ret, op_errno, - inode, buf, xdata, postparent); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, - xdata, postparent); - return 0; -} - -int32_t -glupy_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_LOOKUP]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_lookup_t)(priv->fops[GLUPY_LOOKUP]))( - frame, this, loc, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xdata); - return 0; -} - -void -wind_lookup (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_lookup_cbk,xl,xl->fops->lookup,loc,xdata); -} - -void -unwind_lookup (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(lookup,frame,op_ret,op_errno, - inode,buf,xdata,postparent); -} - -void -set_lookup_fop (long py_this, fop_lookup_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_LOOKUP] = (long)fop; -} - -void -set_lookup_cbk (long py_this, fop_lookup_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_LOOKUP] = (long)cbk; -} - -/* FOP: CREATE */ - -int32_t -glupy_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt 
*postparent, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_CREATE]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_create_cbk_t)(priv->cbks[GLUPY_CREATE]))( - frame, cookie, this, op_ret, op_errno, - fd, inode, buf, preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); - return 0; -} - -int32_t -glupy_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_CREATE]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_create_t)(priv->fops[GLUPY_CREATE]))( - frame, this, loc, flags, mode, umask, fd, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, - fd, xdata); - return 0; -} - -void -wind_create (call_frame_t *frame, xlator_t *xl, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_create_cbk,xl, xl->fops->create, - loc, flags, mode, umask, fd, xdata); -} - -void -unwind_create (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); -} - -void -set_create_fop (long py_this, fop_create_t fop) -{ - glupy_private_t *priv = ((xlator_t 
*)py_this)->private; - - priv->fops[GLUPY_CREATE] = (long)fop; -} - -void -set_create_cbk (long py_this, fop_create_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_CREATE] = (long)cbk; -} - -/* FOP: OPEN */ - -int32_t -glupy_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_OPEN]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_open_cbk_t)(priv->cbks[GLUPY_OPEN]))( - frame, cookie, this, op_ret, op_errno, - fd, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; -} - -int32_t -glupy_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_OPEN]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_open_t)(priv->fops[GLUPY_OPEN]))( - frame, this, loc, flags, fd, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; -} - -void -wind_open (call_frame_t *frame, xlator_t *xl, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_open_cbk, xl, xl->fops->open, loc, flags, - fd, xdata); -} - -void -unwind_open (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); -} - -void -set_open_fop (long py_this, fop_open_t fop) -{ - 
glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->fops[GLUPY_OPEN] = (long)fop; -} - -void -set_open_cbk (long py_this, fop_open_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->cbks[GLUPY_OPEN] = (long)cbk; -} - -/* FOP: READV */ - -int32_t -glupy_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_READV]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_readv_cbk_t)(priv->cbks[GLUPY_READV]))( - frame, cookie, this, op_ret, op_errno, - vector, count, stbuf, iobref, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, - count, stbuf, iobref, xdata); - return 0; -} - -int32_t -glupy_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, uint32_t flags, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_READV]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_readv_t)(priv->fops[GLUPY_READV]))( - frame, this, fd, size, offset, flags, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset, - flags, xdata); - return 0; -} - -void -wind_readv (call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_readv_cbk, xl, xl->fops->readv, fd, size, - offset, flags, xdata); -} - -void -unwind_readv (call_frame_t *frame, long cookie, 
xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, - count, stbuf, iobref, xdata); -} - -void -set_readv_fop (long py_this, fop_readv_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->fops[GLUPY_READV] = (long)fop; -} - -void -set_readv_cbk (long py_this, fop_readv_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->cbks[GLUPY_READV] = (long)cbk; -} - -/* FOP: WRITEV */ - -int32_t -glupy_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_WRITEV]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_writev_cbk_t)(priv->cbks[GLUPY_WRITEV]))( - frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; -} - -int32_t -glupy_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_WRITEV]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_writev_t)(priv->fops[GLUPY_WRITEV]))( - frame, this, fd, vector, count, offset, flags, - iobref, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, - offset, flags, iobref, xdata); - return 0; -} - 
-void -wind_writev (call_frame_t *frame, xlator_t *xl, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_writev_cbk, xl, xl->fops->writev, fd, vector, - count, offset, flags, iobref, xdata); -} - -void -unwind_writev (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, - postbuf, xdata); -} - -void -set_writev_fop (long py_this, fop_writev_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->fops[GLUPY_WRITEV] = (long)fop; -} - -void -set_writev_cbk (long py_this, fop_writev_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - priv->cbks[GLUPY_WRITEV] = (long)cbk; -} - - -/* FOP: OPENDIR */ - -int32_t -glupy_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_OPENDIR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_opendir_cbk_t)(priv->cbks[GLUPY_OPENDIR]))( - frame, cookie, this, op_ret, op_errno, - fd, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); - return 0; -} - -int32_t -glupy_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, - fd_t *fd, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_OPENDIR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_opendir_t)(priv->fops[GLUPY_OPENDIR]))( - frame, 
this, loc, fd, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_opendir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); - return 0; -} - -void -wind_opendir (call_frame_t *frame, xlator_t *xl, loc_t *loc, fd_t *fd, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_opendir_cbk,xl,xl->fops->opendir,loc,fd,xdata); -} - -void -unwind_opendir (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(opendir,frame,op_ret,op_errno, - fd,xdata); -} - -void -set_opendir_fop (long py_this, fop_opendir_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_OPENDIR] = (long)fop; -} - -void -set_opendir_cbk (long py_this, fop_opendir_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_OPENDIR] = (long)cbk; -} - -/* FOP: READDIR */ - -int32_t -glupy_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_READDIR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_readdir_cbk_t)(priv->cbks[GLUPY_READDIR]))( - frame, cookie, this, op_ret, op_errno, - entries, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, - xdata); - return 0; -} - -int32_t -glupy_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_READDIR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void 
*)++next_id; - ret = ((fop_readdir_t)(priv->fops[GLUPY_READDIR]))( - frame, this, fd, size, offset, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_readdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir,fd, size, offset, xdata); - return 0; -} - -void -wind_readdir(call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_readdir_cbk,xl,xl->fops->readdir,fd,size,offset,xdata); -} - -void -unwind_readdir (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(readdir,frame,op_ret,op_errno, - entries, xdata); -} - -void -set_readdir_fop (long py_this, fop_readdir_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_READDIR] = (long)fop; -} - -void -set_readdir_cbk (long py_this, fop_readdir_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_READDIR] = (long)cbk; -} - - -/* FOP: READDIRP */ - -int32_t -glupy_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_READDIRP]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_readdirp_cbk_t)(priv->cbks[GLUPY_READDIRP]))( - frame, cookie, this, op_ret, op_errno, - entries, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, - xdata); - return 0; -} - -int32_t -glupy_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - 
int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_READDIRP]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_readdirp_t)(priv->fops[GLUPY_READDIRP]))( - frame, this, fd, size, offset, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_readdirp_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp,fd, size, offset, xdata); - return 0; -} - -void -wind_readdirp (call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_readdirp_cbk,xl,xl->fops->readdirp,fd,size,offset,xdata); -} - -void -unwind_readdirp (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(readdirp,frame,op_ret,op_errno, - entries, xdata); -} - -void -set_readdirp_fop (long py_this, fop_readdirp_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_READDIRP] = (long)fop; -} - -void -set_readdirp_cbk (long py_this, fop_readdirp_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_READDIRP] = (long)cbk; -} - - -/* FOP:STAT */ - -int32_t -glupy_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_STAT]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_stat_cbk_t)(priv->cbks[GLUPY_STAT]))( - frame, cookie, this, op_ret, op_errno, - buf, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); - return 0; -} - -int32_t -glupy_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, 
- dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_STAT]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_stat_t)(priv->fops[GLUPY_STAT]))( - frame, this, loc, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; -} - -void -wind_stat (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_stat_cbk,xl,xl->fops->stat,loc,xdata); -} - -void -unwind_stat (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(stat,frame,op_ret,op_errno, - buf,xdata); -} - -void -set_stat_fop (long py_this, fop_stat_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_STAT] = (long)fop; -} - -void -set_stat_cbk (long py_this, fop_stat_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_STAT] = (long)cbk; -} - - -/* FOP: FSTAT */ - -int32_t -glupy_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_FSTAT]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_fstat_cbk_t)(priv->cbks[GLUPY_FSTAT]))( - frame, cookie, this, op_ret, op_errno, - buf, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); - return 0; -} - -int32_t -glupy_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *xdata) -{ - glupy_private_t 
*priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_FSTAT]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_fstat_t)(priv->fops[GLUPY_FSTAT]))( - frame, this, fd, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; -} - -void -wind_fstat (call_frame_t *frame, xlator_t *xl, fd_t *fd, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_fstat_cbk,xl,xl->fops->fstat,fd,xdata); -} - -void -unwind_fstat (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(fstat,frame,op_ret,op_errno, - buf,xdata); -} - -void -set_fstat_fop (long py_this, fop_fstat_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_FSTAT] = (long)fop; -} - -void -set_fstat_cbk (long py_this, fop_fstat_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_FSTAT] = (long)cbk; -} - -/* FOP:STATFS */ - -int32_t -glupy_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_STATFS]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_statfs_cbk_t)(priv->cbks[GLUPY_STATFS]))( - frame, cookie, this, op_ret, op_errno, - buf, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); - return 0; -} - -int32_t -glupy_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) -{ - glupy_private_t *priv = 
this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_STATFS]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_statfs_t)(priv->fops[GLUPY_STATFS]))( - frame, this, loc, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_statfs_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->statfs, loc, xdata); - return 0; -} - -void -wind_statfs (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND(frame,glupy_statfs_cbk,xl,xl->fops->statfs,loc,xdata); -} - -void -unwind_statfs (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT(statfs,frame,op_ret,op_errno, - buf,xdata); -} - -void -set_statfs_fop (long py_this, fop_statfs_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_STATFS] = (long)fop; -} - -void -set_statfs_cbk (long py_this, fop_statfs_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_STATFS] = (long)cbk; -} - - -/* FOP: SETXATTR */ - -int32_t -glupy_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_SETXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_setxattr_cbk_t)(priv->cbks[GLUPY_SETXATTR]))( - frame, cookie, this, op_ret, op_errno, - xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); - return 0; -} - -int32_t -glupy_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags, dict_t *xdata) -{ - 
glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_SETXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_setxattr_t)(priv->fops[GLUPY_SETXATTR]))( - frame, this, loc, dict, flags, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_setxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, - flags, xdata); - return 0; -} - -void -wind_setxattr (call_frame_t *frame, xlator_t *xl, loc_t *loc, - dict_t *dict, int32_t flags, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_setxattr_cbk, xl, xl->fops->setxattr, - loc, dict, flags, xdata); -} - - -void -unwind_setxattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); - -} - -void -set_setxattr_fop (long py_this, fop_setxattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_SETXATTR] = (long)fop; -} - -void -set_setxattr_cbk (long py_this, fop_setxattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_SETXATTR] = (long)cbk; -} - -/* FOP: GETXATTR */ - -int32_t -glupy_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_GETXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_getxattr_cbk_t)(priv->cbks[GLUPY_GETXATTR]))( - frame, cookie, this, op_ret, op_errno, dict, - xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, - xdata); - 
return 0; -} - -int32_t -glupy_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_GETXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_getxattr_t)(priv->fops[GLUPY_GETXATTR]))( - frame, this, loc, name, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, - xdata); - return 0; -} - -void -wind_getxattr (call_frame_t *frame, xlator_t *xl, loc_t *loc, - const char *name, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_getxattr_cbk, xl, xl->fops->getxattr, - loc, name, xdata); -} - - -void -unwind_getxattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, - xdata); - -} - - -void -set_getxattr_fop (long py_this, fop_getxattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_GETXATTR] = (long)fop; -} - - -void -set_getxattr_cbk (long py_this, fop_getxattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_GETXATTR] = (long)cbk; -} - -/* FOP: FSETXATTR */ - -int32_t -glupy_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_FSETXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_fsetxattr_cbk_t)(priv->cbks[GLUPY_FSETXATTR]))( - frame, cookie, this, op_ret, op_errno, - xdata); - glupy_leave(gstate); - - return ret; - 
-unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); - return 0; -} - -int32_t -glupy_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int32_t flags, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_FSETXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_fsetxattr_t)(priv->fops[GLUPY_FSETXATTR]))( - frame, this, fd, dict, flags, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_fsetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, fd, dict, - flags, xdata); - return 0; -} - -void -wind_fsetxattr (call_frame_t *frame, xlator_t *xl, fd_t *fd, - dict_t *dict, int32_t flags, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_fsetxattr_cbk, xl, xl->fops->fsetxattr, - fd, dict, flags, xdata); -} - - -void -unwind_fsetxattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); - -} - -void -set_fsetxattr_fop (long py_this, fop_fsetxattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_FSETXATTR] = (long)fop; -} - -void -set_fsetxattr_cbk (long py_this, fop_fsetxattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_FSETXATTR] = (long)cbk; -} - -/* FOP: FGETXATTR */ - -int32_t -glupy_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_FGETXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = 
((fop_fgetxattr_cbk_t)(priv->cbks[GLUPY_FGETXATTR]))( - frame, cookie, this, op_ret, op_errno, dict, - xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - return 0; -} - -int32_t -glupy_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_FGETXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_fgetxattr_t)(priv->fops[GLUPY_FGETXATTR]))( - frame, this, fd, name, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_fgetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, fd, name, - xdata); - return 0; -} - -void -wind_fgetxattr (call_frame_t *frame, xlator_t *xl, fd_t *fd, - const char *name, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_fgetxattr_cbk, xl, xl->fops->fgetxattr, - fd, name, xdata); -} - - -void -unwind_fgetxattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - -} - - -void -set_fgetxattr_fop (long py_this, fop_fgetxattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_FGETXATTR] = (long)fop; -} - - -void -set_fgetxattr_cbk (long py_this, fop_fgetxattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_FGETXATTR] = (long)cbk; -} - -/* FOP:REMOVEXATTR */ - -int32_t -glupy_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - 
PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_REMOVEXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_removexattr_cbk_t)(priv->cbks[GLUPY_REMOVEXATTR]))( - frame, cookie, this, op_ret, op_errno, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); - return 0; -} - -int32_t -glupy_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_REMOVEXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_removexattr_t)(priv->fops[GLUPY_REMOVEXATTR]))( - frame, this, loc, name, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_removexattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name, - xdata); - return 0; -} - -void -wind_removexattr (call_frame_t *frame, xlator_t *xl, loc_t *loc, - const char *name, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_removexattr_cbk, xl, xl->fops->removexattr, - loc, name, xdata); -} - - -void -unwind_removexattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); - -} - -void -set_removexattr_fop (long py_this, fop_removexattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_REMOVEXATTR] = (long)fop; -} - -void -set_removexattr_cbk (long py_this, fop_removexattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_REMOVEXATTR] = (long)cbk; -} - - -/* FOP:FREMOVEXATTR */ - -int32_t -glupy_fremovexattr_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_FREMOVEXATTR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_fremovexattr_cbk_t)(priv->cbks[GLUPY_FREMOVEXATTR]))( - frame, cookie, this, op_ret, op_errno, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); - return 0; -} - -int32_t -glupy_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_FREMOVEXATTR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_fremovexattr_t)(priv->fops[GLUPY_FREMOVEXATTR]))( - frame, this, fd, name, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_fremovexattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fremovexattr, fd, name, - xdata); - return 0; -} - -void -wind_fremovexattr (call_frame_t *frame, xlator_t *xl, fd_t *fd, - const char *name, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_fremovexattr_cbk, xl, xl->fops->fremovexattr, - fd, name, xdata); -} - - -void -unwind_fremovexattr (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); - -} - -void -set_fremovexattr_fop (long py_this, fop_fremovexattr_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_FREMOVEXATTR] = (long)fop; -} - -void -set_fremovexattr_cbk (long py_this, fop_fremovexattr_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t 
*)py_this)->private; - - priv->cbks[GLUPY_FREMOVEXATTR] = (long)cbk; -} - - -/* FOP: LINK*/ -int32_t -glupy_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_LINK]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_link_cbk_t)(priv->cbks[GLUPY_LINK]))( - frame, cookie, this, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; -} - -int32_t -glupy_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_LINK]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_link_t)(priv->fops[GLUPY_LINK]))( - frame, this, oldloc, newloc, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_link_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, oldloc, newloc, - xdata); - return 0; -} - -void -wind_link (call_frame_t *frame, xlator_t *xl, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_link_cbk, xl, xl->fops->link, - oldloc, newloc, xdata); -} - -void -unwind_link (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, - preparent, 
postparent, xdata); -} - -void -set_link_fop (long py_this, fop_link_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_LINK] = (long)fop; -} - -void -set_link_cbk (long py_this, fop_link_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_LINK] = (long)cbk; -} - -/* FOP: SYMLINK*/ -int32_t -glupy_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_SYMLINK]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_symlink_cbk_t)(priv->cbks[GLUPY_SYMLINK]))( - frame, cookie, this, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; -} - -int32_t -glupy_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_SYMLINK]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_symlink_t)(priv->fops[GLUPY_SYMLINK]))( - frame, this, linkname, loc, umask, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_symlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, linkname, loc, - umask, xdata); - return 0; -} - -void -wind_symlink (call_frame_t *frame, xlator_t *xl, const char *linkname, - loc_t *loc, mode_t umask, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_symlink_cbk, xl, 
xl->fops->symlink, - linkname, loc, umask, xdata); -} - -void -unwind_symlink (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); -} - -void -set_symlink_fop (long py_this, fop_symlink_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_SYMLINK] = (long)fop; -} - -void -set_symlink_cbk (long py_this, fop_symlink_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_SYMLINK] = (long)cbk; -} - - -/* FOP: READLINK */ -int32_t -glupy_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *buf, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_READLINK]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_readlink_cbk_t)(priv->cbks[GLUPY_READLINK]))( - frame, cookie, this, op_ret, op_errno, - path, buf, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, - buf, xdata); - return 0; -} - -int32_t -glupy_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_READLINK]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_readlink_t)(priv->fops[GLUPY_READLINK]))( - frame, this, loc, size, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_readlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, loc, - size, xdata); - return 0; 
-} - -void -wind_readlink (call_frame_t *frame, xlator_t *xl, loc_t *loc, - size_t size, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_readlink_cbk, xl, xl->fops->readlink, - loc, size, xdata); -} - -void -unwind_readlink (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *buf, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, buf, - xdata); -} - -void -set_readlink_fop (long py_this, fop_readlink_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_READLINK] = (long)fop; -} - -void -set_readlink_cbk (long py_this, fop_readlink_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_READLINK] = (long)cbk; -} - - -/* FOP: UNLINK */ - -int32_t -glupy_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_UNLINK]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_unlink_cbk_t)(priv->cbks[GLUPY_UNLINK]))( - frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, - postparent, xdata); - return 0; -} - -int32_t -glupy_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, - int xflags, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_UNLINK]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_unlink_t)(priv->fops[GLUPY_UNLINK]))( - frame, this, loc, xflags, 
xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc, - xflags, xdata); - return 0; -} - -void -wind_unlink (call_frame_t *frame, xlator_t *xl, loc_t *loc, - int xflags, dict_t *xdata) -{ - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_unlink_cbk, xl, xl->fops->unlink, - loc, xflags, xdata); -} - -void -unwind_unlink (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, - preparent, postparent, xdata); -} - -void -set_unlink_fop (long py_this, fop_unlink_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_UNLINK] = (long)fop; -} - -void -set_unlink_cbk (long py_this, fop_unlink_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_UNLINK] = (long)cbk; -} - - -/* FOP: MKDIR */ - -int32_t -glupy_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_MKDIR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_mkdir_cbk_t)(priv->cbks[GLUPY_MKDIR]))( - frame, cookie, this, op_ret, op_errno, - inode, buf, preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); - return 0; -} - -int32_t -glupy_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - 
PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_MKDIR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_mkdir_t)(priv->fops[GLUPY_MKDIR]))( - frame, this, loc, mode, umask, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_mkdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, - xdata); - return 0; -} - -void -wind_mkdir (call_frame_t *frame, xlator_t *xl, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) -{ - - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_mkdir_cbk, xl, xl->fops->mkdir, - loc, mode, umask, xdata); -} - -void -unwind_mkdir (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); -} - -void -set_mkdir_fop (long py_this, fop_mkdir_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_MKDIR] = (long)fop; -} - -void -set_mkdir_cbk (long py_this, fop_mkdir_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_MKDIR] = (long)cbk; -} - - -/* FOP: RMDIR */ - -int32_t -glupy_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - - if (!priv->cbks[GLUPY_RMDIR]) { - goto unwind; - } - - gstate = glupy_enter(); - ret = ((fop_rmdir_cbk_t)(priv->cbks[GLUPY_RMDIR]))( - frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata); - glupy_leave(gstate); - - return ret; - -unwind: - frame->local = NULL; - 
STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, - postparent, xdata); - return 0; -} - -int32_t -glupy_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, - int xflags, dict_t *xdata) -{ - glupy_private_t *priv = this->private; - PyGILState_STATE gstate; - int32_t ret; - static long next_id = 0; - - if (!priv->fops[GLUPY_RMDIR]) { - goto wind; - } - - gstate = glupy_enter(); - frame->local = (void *)++next_id; - ret = ((fop_rmdir_t)(priv->fops[GLUPY_RMDIR]))( - frame, this, loc, xflags, xdata); - glupy_leave(gstate); - - return ret; - -wind: - STACK_WIND (frame, glupy_rmdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, loc, - xflags, xdata); - return 0; -} - -void -wind_rmdir (call_frame_t *frame, xlator_t *xl, loc_t *loc, - int xflags, dict_t *xdata) -{ - - xlator_t *this = THIS; - - if (!xl || (xl == this)) { - xl = FIRST_CHILD(this); - } - - STACK_WIND (frame, glupy_rmdir_cbk, xl, xl->fops->rmdir, - loc, xflags, xdata); -} - -void -unwind_rmdir (call_frame_t *frame, long cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - frame->local = NULL; - STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, - preparent, postparent, xdata); -} - -void -set_rmdir_fop (long py_this, fop_rmdir_t fop) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->fops[GLUPY_RMDIR] = (long)fop; -} - -void -set_rmdir_cbk (long py_this, fop_rmdir_cbk_t cbk) -{ - glupy_private_t *priv = ((xlator_t *)py_this)->private; - - priv->cbks[GLUPY_RMDIR] = (long)cbk; -} - - -/* NON-FOP-SPECIFIC CODE */ - - -long -get_id (call_frame_t *frame) -{ - return (long)(frame->local); -} - -uint64_t -get_rootunique (call_frame_t *frame) -{ - return frame->root->unique; -} - -int32_t -init (xlator_t *this) -{ - glupy_private_t *priv = NULL; - char *module_name = NULL; - PyObject *py_mod_name = NULL; - PyObject *py_init_func = NULL; - PyObject *py_args = NULL; - PyObject *syspath = 
NULL; - PyObject *path = NULL; - static gf_boolean_t py_inited = _gf_false; - void * err_cleanup = &&err_return; - - if (dict_get_str(this->options,"module-name",&module_name) != 0) { - gf_log (this->name, GF_LOG_ERROR, "missing module-name"); - return -1; - } - - priv = GF_CALLOC (1, sizeof (glupy_private_t), gf_glupy_mt_priv); - if (!priv) { - goto *err_cleanup; - } - this->private = priv; - err_cleanup = &&err_free_priv; - - if (!py_inited) { - Py_Initialize(); - PyEval_InitThreads(); -#if 0 - (void)pthread_key_create(&gil_init_key,NULL); - (void)pthread_setspecific(gil_init_key,(void *)1); -#endif - /* PyEval_InitThreads takes this "for" us. No thanks. */ - PyEval_ReleaseLock(); - py_inited = _gf_true; - } - - /* Adjust python's path */ - syspath = PySys_GetObject("path"); - path = PyString_FromString(GLUSTER_PYTHON_PATH); - PyList_Append(syspath, path); - Py_DECREF(path); - - py_mod_name = PyString_FromString(module_name); - if (!py_mod_name) { - gf_log (this->name, GF_LOG_ERROR, "could not create name"); - if (PyErr_Occurred()) { - PyErr_Print(); - } - goto *err_cleanup; - } - - gf_log (this->name, GF_LOG_ERROR, "py_mod_name = %s", module_name); - priv->py_module = PyImport_Import(py_mod_name); - Py_DECREF(py_mod_name); - if (!priv->py_module) { - gf_log (this->name, GF_LOG_ERROR, "Python import failed"); - if (PyErr_Occurred()) { - PyErr_Print(); - } - goto *err_cleanup; - } - err_cleanup = &&err_deref_module; - - py_init_func = PyObject_GetAttrString(priv->py_module, "xlator"); - if (!py_init_func || !PyCallable_Check(py_init_func)) { - gf_log (this->name, GF_LOG_ERROR, "missing init func"); - if (PyErr_Occurred()) { - PyErr_Print(); - } - goto *err_cleanup; - } - err_cleanup = &&err_deref_init; - - py_args = PyTuple_New(1); - if (!py_args) { - gf_log (this->name, GF_LOG_ERROR, "could not create args"); - if (PyErr_Occurred()) { - PyErr_Print(); - } - goto *err_cleanup; - } - PyTuple_SetItem(py_args,0,PyLong_FromLong((long)this)); - - /* TBD: pass in list 
of children */ - priv->py_xlator = PyObject_CallObject(py_init_func, py_args); - Py_DECREF(py_args); - if (!priv->py_xlator) { - gf_log (this->name, GF_LOG_ERROR, "Python init failed"); - if (PyErr_Occurred()) { - PyErr_Print(); - } - goto *err_cleanup; - } - gf_log (this->name, GF_LOG_INFO, "init returned %p", priv->py_xlator); - - return 0; - -err_deref_init: - Py_DECREF(py_init_func); -err_deref_module: - Py_DECREF(priv->py_module); -err_free_priv: - GF_FREE(priv); -err_return: - return -1; -} - -void -fini (xlator_t *this) -{ - glupy_private_t *priv = this->private; - - if (!priv) - return; - Py_DECREF(priv->py_xlator); - Py_DECREF(priv->py_module); - this->private = NULL; - GF_FREE (priv); - - return; -} - -struct xlator_fops fops = { - .lookup = glupy_lookup, - .create = glupy_create, - .open = glupy_open, - .readv = glupy_readv, - .writev = glupy_writev, - .opendir = glupy_opendir, - .readdir = glupy_readdir, - .stat = glupy_stat, - .fstat = glupy_fstat, - .setxattr = glupy_setxattr, - .getxattr = glupy_getxattr, - .fsetxattr = glupy_fsetxattr, - .fgetxattr = glupy_fgetxattr, - .removexattr = glupy_removexattr, - .fremovexattr = glupy_fremovexattr, - .link = glupy_link, - .unlink = glupy_unlink, - .readlink = glupy_readlink, - .symlink = glupy_symlink, - .mkdir = glupy_mkdir, - .rmdir = glupy_rmdir, - .statfs = glupy_statfs, - .readdirp = glupy_readdirp -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/glupy/src/glupy.h b/xlators/features/glupy/src/glupy.h deleted file mode 100644 index 8661fce88c5..00000000000 --- a/xlators/features/glupy/src/glupy.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __GLUPY_H__ -#define __GLUPY_H__ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif -#include "mem-types.h" - -enum { - GLUPY_LOOKUP = 0, - GLUPY_CREATE, - GLUPY_OPEN, - GLUPY_READV, - GLUPY_WRITEV, - GLUPY_OPENDIR, - GLUPY_READDIR, - GLUPY_READDIRP, - GLUPY_STAT, - GLUPY_FSTAT, - GLUPY_STATFS, - GLUPY_SETXATTR, - GLUPY_GETXATTR, - GLUPY_FSETXATTR, - GLUPY_FGETXATTR, - GLUPY_REMOVEXATTR, - GLUPY_FREMOVEXATTR, - GLUPY_LINK, - GLUPY_UNLINK, - GLUPY_READLINK, - GLUPY_SYMLINK, - GLUPY_MKNOD, - GLUPY_MKDIR, - GLUPY_RMDIR, - GLUPY_N_FUNCS -}; - -typedef struct { - PyObject *py_module; - PyObject *py_xlator; - long fops[GLUPY_N_FUNCS]; - long cbks[GLUPY_N_FUNCS]; -} glupy_private_t; - -enum gf_glupy_mem_types_ { - gf_glupy_mt_priv = gf_common_mt_end + 1, - gf_glupy_mt_end -}; - -#endif /* __GLUPY_H__ */ diff --git a/xlators/features/glupy/src/gluster.py b/xlators/features/glupy/src/gluster.py deleted file mode 100644 index a5daa77d32a..00000000000 --- a/xlators/features/glupy/src/gluster.py +++ /dev/null @@ -1,841 +0,0 @@ -import sys -from ctypes import * - -dl = CDLL("",RTLD_GLOBAL) - - -class call_frame_t (Structure): - pass - -class dev_t (Structure): - pass - - -class dict_t (Structure): - pass - - -class gf_dirent_t (Structure): - pass - - -class iobref_t (Structure): - pass - - -class iovec_t (Structure): - pass - - 
-class list_head (Structure): - pass - -list_head._fields_ = [ - ("next", POINTER(list_head)), - ("prev", POINTER(list_head)) - ] - - -class rwxperm_t (Structure): - _fields_ = [ - ("read", c_uint8, 1), - ("write", c_uint8, 1), - ("execn", c_uint8, 1) - ] - - -class statvfs_t (Structure): - pass - - -class xlator_t (Structure): - pass - - -class ia_prot_t (Structure): - _fields_ = [ - ("suid", c_uint8, 1), - ("sgid", c_uint8, 1), - ("sticky", c_uint8, 1), - ("owner", rwxperm_t), - ("group", rwxperm_t), - ("other", rwxperm_t) - ] - -# For checking file type. -(IA_INVAL, IA_IFREG, IA_IFDIR, IA_IFLNK, IA_IFBLK, IA_IFCHR, IA_IFIFO, - IA_IFSOCK) = xrange(8) - - -class iatt_t (Structure): - _fields_ = [ - ("ia_no", c_uint64), - ("ia_gfid", c_ubyte * 16), - ("ia_dev", c_uint64), - ("ia_type", c_uint), - ("ia_prot", ia_prot_t), - ("ia_nlink", c_uint32), - ("ia_uid", c_uint32), - ("ia_gid", c_uint32), - ("ia_rdev", c_uint64), - ("ia_size", c_uint64), - ("ia_blksize", c_uint32), - ("ia_blocks", c_uint64), - ("ia_atime", c_uint32 ), - ("ia_atime_nsec", c_uint32), - ("ia_mtime", c_uint32), - ("ia_mtime_nsec", c_uint32), - ("ia_ctime", c_uint32), - ("ia_ctime_nsec", c_uint32) - ] - - -class mem_pool (Structure): - _fields_ = [ - ("list", list_head), - ("hot_count", c_int), - ("cold_count", c_int), - ("lock", c_void_p), - ("padded_sizeof_type", c_ulong), - ("pool", c_void_p), - ("pool_end", c_void_p), - ("real_sizeof_type", c_int), - ("alloc_count", c_uint64), - ("pool_misses", c_uint64), - ("max_alloc", c_int), - ("curr_stdalloc", c_int), - ("max_stdalloc", c_int), - ("name", c_char_p), - ("global_list", list_head) - ] - - -class U_ctx_key_inode (Union): - _fields_ = [ - ("key", c_uint64), - ("xl_key", POINTER(xlator_t)) - ] - - -class U_ctx_value1 (Union): - _fields_ = [ - ("value1", c_uint64), - ("ptr1", c_void_p) - ] - - -class U_ctx_value2 (Union): - _fields_ = [ - ("value2", c_uint64), - ("ptr2", c_void_p) - ] - -class inode_ctx (Structure): - _anonymous_ = 
("u_key","u_value1","u_value2",) - _fields_ = [ - ("u_key", U_ctx_key_inode), - ("u_value1", U_ctx_value1), - ("u_value2", U_ctx_value2) - ] - -class inode_t (Structure): - pass - -class inode_table_t (Structure): - _fields_ = [ - ("lock", c_void_p), - ("hashsize", c_size_t), - ("name", c_char_p), - ("root", POINTER(inode_t)), - ("xl", POINTER(xlator_t)), - ("lru_limit", c_uint32), - ("inode_hash", POINTER(list_head)), - ("name_hash", POINTER(list_head)), - ("active", list_head), - ("active_size", c_uint32), - ("lru", list_head), - ("lru_size", c_uint32), - ("purge", list_head), - ("purge_size", c_uint32), - ("inode_pool", POINTER(mem_pool)), - ("dentry_pool", POINTER(mem_pool)), - ("fd_mem_pool", POINTER(mem_pool)) - ] - -inode_t._fields_ = [ - ("table", POINTER(inode_table_t)), - ("gfid", c_ubyte * 16), - ("lock", c_void_p), - ("nlookup", c_uint64), - ("fd_count", c_uint32), - ("ref", c_uint32), - ("ia_type", c_uint), - ("fd_list", list_head), - ("dentry_list", list_head), - ("hashv", list_head), - ("listv", list_head), - ("ctx", POINTER(inode_ctx)) - ] - - - -class U_ctx_key_fd (Union): - _fields_ = [ - ("key", c_uint64), - ("xl_key", c_void_p) - ] - -class fd_lk_ctx (Structure): - _fields_ = [ - ("lk_list", list_head), - ("ref", c_int), - ("lock", c_void_p) - ] - -class fd_ctx (Structure): - _anonymous_ = ("u_key","u_value1") - _fields_ = [ - ("u_key", U_ctx_key_fd), - ("u_value1", U_ctx_value1) - ] - -class fd_t (Structure): - _fields_ = [ - ("pid", c_uint64), - ("flags", c_int32), - ("refcount", c_int32), - ("inode_list", list_head), - ("inode", POINTER(inode_t)), - ("lock", c_void_p), - ("ctx", POINTER(fd_ctx)), - ("xl_count", c_int), - ("lk_ctx", POINTER(fd_lk_ctx)), - ("anonymous", c_uint) - ] - -class loc_t (Structure): - _fields_ = [ - ("path", c_char_p), - ("name", c_char_p), - ("inode", POINTER(inode_t)), - ("parent", POINTER(inode_t)), - ("gfid", c_ubyte * 16), - ("pargfid", c_ubyte * 16), - ] - - - -def _init_op (a_class, fop, cbk, wind, unwind): - # 
Decorators, used by translators. We could pass the signatures as - # parameters, but it's actually kind of nice to keep them around for - # inspection. - a_class.fop_type = apply(CFUNCTYPE,a_class.fop_sig) - a_class.cbk_type = apply(CFUNCTYPE,a_class.cbk_sig) - # Dispatch-function registration. - fop.restype = None - fop.argtypes = [ c_long, a_class.fop_type ] - # Callback-function registration. - cbk.restype = None - cbk.argtypes = [ c_long, a_class.cbk_type ] - # STACK_WIND function. - wind.restype = None - wind.argtypes = list(a_class.fop_sig[1:]) - # STACK_UNWIND function. - unwind.restype = None - unwind.argtypes = list(a_class.cbk_sig[1:]) - -class OpLookup: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(inode_t), POINTER(iatt_t), - POINTER(dict_t), POINTER(iatt_t)) -_init_op (OpLookup, dl.set_lookup_fop, dl.set_lookup_cbk, - dl.wind_lookup, dl.unwind_lookup) - -class OpCreate: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_int, c_uint, c_uint, POINTER(fd_t), - POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(fd_t), POINTER(inode_t), - POINTER(iatt_t), POINTER(iatt_t), POINTER(iatt_t), - POINTER(dict_t)) -_init_op (OpCreate, dl.set_create_fop, dl.set_create_cbk, - dl.wind_create, dl.unwind_create) - -class OpOpen: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_int, POINTER(fd_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(fd_t), POINTER(dict_t)) -_init_op (OpOpen, dl.set_open_fop, dl.set_open_cbk, - dl.wind_open, dl.unwind_open) - -class OpReadv: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), c_size_t, c_long, c_uint32, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, 
POINTER(xlator_t), - c_int, c_int, POINTER(iovec_t), c_int, POINTER(iatt_t), - POINTER(iobref_t), POINTER(dict_t)) -_init_op (OpReadv, dl.set_readv_fop, dl.set_readv_cbk, - dl.wind_readv, dl.unwind_readv) -class OpWritev: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), POINTER(iovec_t), c_int, c_long, c_uint32, - POINTER(iobref_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(iatt_t), POINTER(iatt_t), - POINTER(dict_t)) -_init_op (OpWritev, dl.set_writev_fop, dl.set_writev_cbk, - dl.wind_writev, dl.unwind_writev) - -class OpOpendir: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(fd_t) ,POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(fd_t), POINTER(dict_t)) -_init_op (OpOpendir, dl.set_opendir_fop, dl.set_opendir_cbk, - dl.wind_opendir, dl.unwind_opendir) - -class OpReaddir: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), c_size_t, c_long, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(gf_dirent_t), POINTER(dict_t)) -_init_op (OpReaddir, dl.set_readdir_fop, dl.set_readdir_cbk, - dl.wind_readdir, dl.unwind_readdir) - -class OpReaddirp: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), c_size_t, c_long, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(gf_dirent_t), POINTER(dict_t)) -_init_op (OpReaddirp, dl.set_readdirp_fop, dl.set_readdirp_cbk, - dl.wind_readdirp, dl.unwind_readdirp) - -class OpStat: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpStat, dl.set_stat_fop, dl.set_stat_cbk, - dl.wind_stat, 
dl.unwind_stat) - -class OpFstat: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpFstat, dl.set_fstat_fop, dl.set_fstat_cbk, - dl.wind_fstat, dl.unwind_fstat) - -class OpStatfs: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(statvfs_t), POINTER(dict_t)) -_init_op (OpStatfs, dl.set_statfs_fop, dl.set_statfs_cbk, - dl.wind_statfs, dl.unwind_statfs) - - -class OpSetxattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(dict_t), c_int32, - POINTER (dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t)) -_init_op (OpSetxattr, dl.set_setxattr_fop, dl.set_setxattr_cbk, - dl.wind_setxattr, dl.unwind_setxattr) - -class OpGetxattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_char_p, POINTER (dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t), POINTER(dict_t)) -_init_op (OpGetxattr, dl.set_getxattr_fop, dl.set_getxattr_cbk, - dl.wind_getxattr, dl.unwind_getxattr) - -class OpFsetxattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), POINTER(dict_t), c_int32, - POINTER (dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t)) -_init_op (OpFsetxattr, dl.set_fsetxattr_fop, dl.set_fsetxattr_cbk, - dl.wind_fsetxattr, dl.unwind_fsetxattr) - -class OpFgetxattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), c_char_p, POINTER (dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t), POINTER(dict_t)) 
-_init_op (OpFgetxattr, dl.set_fgetxattr_fop, dl.set_fgetxattr_cbk, - dl.wind_fgetxattr, dl.unwind_fgetxattr) - -class OpRemovexattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_char_p, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t)) -_init_op (OpRemovexattr, dl.set_removexattr_fop, dl.set_removexattr_cbk, - dl.wind_removexattr, dl.unwind_removexattr) - - -class OpFremovexattr: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), c_char_p, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(dict_t)) -_init_op (OpFremovexattr, dl.set_fremovexattr_fop, dl.set_fremovexattr_cbk, - dl.wind_fremovexattr, dl.unwind_fremovexattr) - -class OpLink: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), POINTER(loc_t), POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(inode_t), POINTER(iatt_t), - POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpLink, dl.set_link_fop, dl.set_link_cbk, - dl.wind_link, dl.unwind_link) - -class OpSymlink: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - c_char_p, POINTER(loc_t), c_uint, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(inode_t), POINTER(iatt_t), - POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpSymlink, dl.set_symlink_fop, dl.set_symlink_cbk, - dl.wind_symlink, dl.unwind_symlink) - -class OpUnlink: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_int, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(iatt_t), POINTER(iatt_t), - POINTER(dict_t)) -_init_op (OpUnlink, dl.set_unlink_fop, dl.set_unlink_cbk, - dl.wind_unlink, dl.unwind_unlink) - -class 
OpReadlink: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_size_t, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, c_char_p, POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpReadlink, dl.set_readlink_fop, dl.set_readlink_cbk, - dl.wind_readlink, dl.unwind_readlink) - -class OpMkdir: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_uint, c_uint, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(inode_t), POINTER(iatt_t), - POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t)) -_init_op (OpMkdir, dl.set_mkdir_fop, dl.set_mkdir_cbk, - dl.wind_mkdir, dl.unwind_mkdir) - -class OpRmdir: - fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(loc_t), c_int, POINTER(dict_t)) - cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t), - c_int, c_int, POINTER(iatt_t), POINTER(iatt_t), - POINTER(dict_t)) -_init_op (OpRmdir, dl.set_rmdir_fop, dl.set_rmdir_cbk, - dl.wind_rmdir, dl.unwind_rmdir) - - -class Translator: - def __init__ (self, c_this): - # This is only here to keep references to the stubs we create, - # because ctypes doesn't and glupy.so can't because it doesn't - # get a pointer to the actual Python object. It's a dictionary - # instead of a list in case we ever allow changing fops/cbks - # after initialization and need to look them up. 
- self.stub_refs = {} - funcs = dir(self.__class__) - if "lookup_fop" in funcs: - @OpLookup.fop_type - def stub (frame, this, loc, xdata, s=self): - return s.lookup_fop (frame, this, loc, xdata) - self.stub_refs["lookup_fop"] = stub - dl.set_lookup_fop(c_this,stub) - if "lookup_cbk" in funcs: - @OpLookup.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, inode, - buf, xdata, postparent, s=self): - return s.lookup_cbk(frame, cookie, this, op_ret, - op_errno, inode, buf, xdata, - postparent) - self.stub_refs["lookup_cbk"] = stub - dl.set_lookup_cbk(c_this,stub) - if "create_fop" in funcs: - @OpCreate.fop_type - def stub (frame, this, loc, flags, mode, umask, fd, - xdata, s=self): - return s.create_fop (frame, this, loc, flags, - mode, umask, fd, xdata) - self.stub_refs["create_fop"] = stub - dl.set_create_fop(c_this,stub) - if "create_cbk" in funcs: - @OpCreate.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, fd, - inode, buf, preparent, postparent, xdata, - s=self): - return s.create_cbk (frame, cookie, this, - op_ret, op_errno, fd, - inode, buf, preparent, - postparent, xdata) - self.stub_refs["create_cbk"] = stub - dl.set_create_cbk(c_this,stub) - if "open_fop" in funcs: - @OpOpen.fop_type - def stub (frame, this, loc, flags, fd, - xdata, s=self): - return s.open_fop (frame, this, loc, flags, - fd, xdata) - self.stub_refs["open_fop"] = stub - dl.set_open_fop(c_this,stub) - if "open_cbk" in funcs: - @OpOpen.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, fd, - xdata, s=self): - return s.open_cbk (frame, cookie, this, - op_ret, op_errno, fd, - xdata) - self.stub_refs["open_cbk"] = stub - dl.set_open_cbk(c_this,stub) - if "readv_fop" in funcs: - @OpReadv.fop_type - def stub (frame, this, fd, size, offset, flags, - xdata, s=self): - return s.readv_fop (frame, this, fd, size, - offset, flags, xdata) - self.stub_refs["readv_fop"] = stub - dl.set_readv_fop(c_this,stub) - if "readv_cbk" in funcs: - @OpReadv.cbk_type - def stub (frame, 
cookie, this, op_ret, op_errno, - vector, count, stbuf, iobref, xdata, - s=self): - return s.readv_cbk (frame, cookie, this, - op_ret, op_errno, vector, - count, stbuf, iobref, - xdata) - self.stub_refs["readv_cbk"] = stub - dl.set_readv_cbk(c_this,stub) - if "writev_fop" in funcs: - @OpWritev.fop_type - def stub (frame, this, fd, vector, count, - offset, flags, iobref, xdata, s=self): - return s.writev_fop (frame, this, fd, vector, - count, offset, flags, - iobref, xdata) - self.stub_refs["writev_fop"] = stub - dl.set_writev_fop(c_this,stub) - if "writev_cbk" in funcs: - @OpWritev.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, xdata, s=self): - return s.writev_cbk (frame, cookie, this, - op_ret, op_errno, prebuf, - postbuf, xdata) - self.stub_refs["writev_cbk"] = stub - dl.set_writev_cbk(c_this,stub) - if "opendir_fop" in funcs: - @OpOpendir.fop_type - def stub (frame, this, loc, fd, xdata, s=self): - return s.opendir_fop (frame, this, loc, fd, - xdata) - self.stub_refs["opendir_fop"] = stub - dl.set_opendir_fop(c_this,stub) - if "opendir_cbk" in funcs: - @OpOpendir.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, fd, - xdata, s=self): - return s.opendir_cbk(frame, cookie, this, - op_ret, op_errno, fd, - xdata) - self.stub_refs["opendir_cbk"] = stub - dl.set_opendir_cbk(c_this,stub) - if "readdir_fop" in funcs: - @OpReaddir.fop_type - def stub (frame, this, fd, size, offset, xdata, s=self): - return s.readdir_fop (frame, this, fd, size, - offset, xdata) - self.stub_refs["readdir_fop"] = stub - dl.set_readdir_fop(c_this,stub) - if "readdir_cbk" in funcs: - @OpReaddir.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - entries, xdata, s=self): - return s.readdir_cbk(frame, cookie, this, - op_ret, op_errno, entries, - xdata) - self.stub_refs["readdir_cbk"] = stub - dl.set_readdir_cbk(c_this,stub) - if "readdirp_fop" in funcs: - @OpReaddirp.fop_type - def stub (frame, this, fd, size, offset, xdata, s=self): - return 
s.readdirp_fop (frame, this, fd, size, - offset, xdata) - self.stub_refs["readdirp_fop"] = stub - dl.set_readdirp_fop(c_this,stub) - if "readdirp_cbk" in funcs: - @OpReaddirp.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - entries, xdata, s=self): - return s.readdirp_cbk (frame, cookie, this, - op_ret, op_errno, - entries, xdata) - self.stub_refs["readdirp_cbk"] = stub - dl.set_readdirp_cbk(c_this,stub) - if "stat_fop" in funcs: - @OpStat.fop_type - def stub (frame, this, loc, xdata, s=self): - return s.stat_fop (frame, this, loc, xdata) - self.stub_refs["stat_fop"] = stub - dl.set_stat_fop(c_this,stub) - if "stat_cbk" in funcs: - @OpStat.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, buf, - xdata, s=self): - return s.stat_cbk(frame, cookie, this, op_ret, - op_errno, buf, xdata) - self.stub_refs["stat_cbk"] = stub - dl.set_stat_cbk(c_this,stub) - if "fstat_fop" in funcs: - @OpFstat.fop_type - def stub (frame, this, fd, xdata, s=self): - return s.fstat_fop (frame, this, fd, xdata) - self.stub_refs["fstat_fop"] = stub - dl.set_fstat_fop(c_this,stub) - if "fstat_cbk" in funcs: - @OpFstat.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, buf, - xdata, s=self): - return s.fstat_cbk(frame, cookie, this, op_ret, - op_errno, buf, xdata) - self.stub_refs["fstat_cbk"] = stub - dl.set_fstat_cbk(c_this,stub) - if "statfs_fop" in funcs: - @OpStatfs.fop_type - def stub (frame, this, loc, xdata, s=self): - return s.statfs_fop (frame, this, loc, xdata) - self.stub_refs["statfs_fop"] = stub - dl.set_statfs_fop(c_this,stub) - if "statfs_cbk" in funcs: - @OpStatfs.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, buf, - xdata, s=self): - return s.statfs_cbk (frame, cookie, this, - op_ret, op_errno, buf, - xdata) - self.stub_refs["statfs_cbk"] = stub - dl.set_statfs_cbk(c_this,stub) - if "setxattr_fop" in funcs: - @OpSetxattr.fop_type - def stub (frame, this, loc, dictionary, flags, xdata, - s=self): - return s.setxattr_fop (frame, this, 
loc, - dictionary, flags, - xdata) - self.stub_refs["setxattr_fop"] = stub - dl.set_setxattr_fop(c_this,stub) - if "setxattr_cbk" in funcs: - @OpSetxattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, xdata, - s=self): - return s.setxattr_cbk(frame, cookie, this, - op_ret, op_errno, xdata) - self.stub_refs["setxattr_cbk"] = stub - dl.set_setxattr_cbk(c_this,stub) - if "getxattr_fop" in funcs: - @OpGetxattr.fop_type - def stub (frame, this, loc, name, xdata, s=self): - return s.getxattr_fop (frame, this, loc, name, - xdata) - self.stub_refs["getxattr_fop"] = stub - dl.set_getxattr_fop(c_this,stub) - if "getxattr_cbk" in funcs: - @OpGetxattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - dictionary, xdata, s=self): - return s.getxattr_cbk(frame, cookie, this, - op_ret, op_errno, - dictionary, xdata) - self.stub_refs["getxattr_cbk"] = stub - dl.set_getxattr_cbk(c_this,stub) - if "fsetxattr_fop" in funcs: - @OpFsetxattr.fop_type - def stub (frame, this, fd, dictionary, flags, xdata, - s=self): - return s.fsetxattr_fop (frame, this, fd, - dictionary, flags, - xdata) - self.stub_refs["fsetxattr_fop"] = stub - dl.set_fsetxattr_fop(c_this,stub) - if "fsetxattr_cbk" in funcs: - @OpFsetxattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, xdata, - s=self): - return s.fsetxattr_cbk(frame, cookie, this, - op_ret, op_errno, xdata) - self.stub_refs["fsetxattr_cbk"] = stub - dl.set_fsetxattr_cbk(c_this,stub) - if "fgetxattr_fop" in funcs: - @OpFgetxattr.fop_type - def stub (frame, this, fd, name, xdata, s=self): - return s.fgetxattr_fop (frame, this, fd, name, - xdata) - self.stub_refs["fgetxattr_fop"] = stub - dl.set_fgetxattr_fop(c_this,stub) - if "fgetxattr_cbk" in funcs: - @OpFgetxattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - dictionary, xdata, s=self): - return s.fgetxattr_cbk(frame, cookie, this, - op_ret, op_errno, - dictionary, xdata) - self.stub_refs["fgetxattr_cbk"] = stub - dl.set_fgetxattr_cbk(c_this,stub) - 
if "removexattr_fop" in funcs: - @OpRemovexattr.fop_type - def stub (frame, this, loc, name, xdata, s=self): - return s.removexattr_fop (frame, this, loc, - name, xdata) - self.stub_refs["removexattr_fop"] = stub - dl.set_removexattr_fop(c_this,stub) - if "removexattr_cbk" in funcs: - @OpRemovexattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - xdata, s=self): - return s.removexattr_cbk(frame, cookie, this, - op_ret, op_errno, - xdata) - self.stub_refs["removexattr_cbk"] = stub - dl.set_removexattr_cbk(c_this,stub) - if "fremovexattr_fop" in funcs: - @OpFremovexattr.fop_type - def stub (frame, this, fd, name, xdata, s=self): - return s.fremovexattr_fop (frame, this, fd, - name, xdata) - self.stub_refs["fremovexattr_fop"] = stub - dl.set_fremovexattr_fop(c_this,stub) - if "fremovexattr_cbk" in funcs: - @OpFremovexattr.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - xdata, s=self): - return s.fremovexattr_cbk(frame, cookie, this, - op_ret, op_errno, - xdata) - self.stub_refs["fremovexattr_cbk"] = stub - dl.set_fremovexattr_cbk(c_this,stub) - if "link_fop" in funcs: - @OpLink.fop_type - def stub (frame, this, oldloc, newloc, - xdata, s=self): - return s.link_fop (frame, this, oldloc, - newloc, xdata) - self.stub_refs["link_fop"] = stub - dl.set_link_fop(c_this,stub) - if "link_cbk" in funcs: - @OpLink.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - inode, buf, preparent, postparent, xdata, - s=self): - return s.link_cbk (frame, cookie, this, - op_ret, op_errno, inode, - buf, preparent, - postparent, xdata) - self.stub_refs["link_cbk"] = stub - dl.set_link_cbk(c_this,stub) - if "symlink_fop" in funcs: - @OpSymlink.fop_type - def stub (frame, this, linkname, loc, - umask, xdata, s=self): - return s.symlink_fop (frame, this, linkname, - loc, umask, xdata) - self.stub_refs["symlink_fop"] = stub - dl.set_symlink_fop(c_this,stub) - if "symlink_cbk" in funcs: - @OpSymlink.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, 
- inode, buf, preparent, postparent, xdata, - s=self): - return s.symlink_cbk (frame, cookie, this, - op_ret, op_errno, inode, - buf, preparent, - postparent, xdata) - self.stub_refs["symlink_cbk"] = stub - dl.set_symlink_cbk(c_this,stub) - if "unlink_fop" in funcs: - @OpUnlink.fop_type - def stub (frame, this, loc, xflags, - xdata, s=self): - return s.unlink_fop (frame, this, loc, - xflags, xdata) - self.stub_refs["unlink_fop"] = stub - dl.set_unlink_fop(c_this,stub) - if "unlink_cbk" in funcs: - @OpUnlink.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata, s=self): - return s.unlink_cbk (frame, cookie, this, - op_ret, op_errno, - preparent, postparent, - xdata) - self.stub_refs["unlink_cbk"] = stub - dl.set_unlink_cbk(c_this,stub) - if "readlink_fop" in funcs: - @OpReadlink.fop_type - def stub (frame, this, loc, size, - xdata, s=self): - return s.readlink_fop (frame, this, loc, - size, xdata) - self.stub_refs["readlink_fop"] = stub - dl.set_readlink_fop(c_this,stub) - if "readlink_cbk" in funcs: - @OpReadlink.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - path, buf, xdata, s=self): - return s.readlink_cbk (frame, cookie, this, - op_ret, op_errno, - path, buf, xdata) - self.stub_refs["readlink_cbk"] = stub - dl.set_readlink_cbk(c_this,stub) - if "mkdir_fop" in funcs: - @OpMkdir.fop_type - def stub (frame, this, loc, mode, umask, xdata, - s=self): - return s.mkdir_fop (frame, this, loc, mode, - umask, xdata) - self.stub_refs["mkdir_fop"] = stub - dl.set_mkdir_fop(c_this,stub) - if "mkdir_cbk" in funcs: - @OpMkdir.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, inode, - buf, preparent, postparent, xdata, s=self): - return s.mkdir_cbk (frame, cookie, this, - op_ret, op_errno, inode, - buf, preparent, - postparent, xdata) - self.stub_refs["mkdir_cbk"] = stub - dl.set_mkdir_cbk(c_this,stub) - if "rmdir_fop" in funcs: - @OpRmdir.fop_type - def stub (frame, this, loc, xflags, - xdata, s=self): - return 
s.rmdir_fop (frame, this, loc, - xflags, xdata) - self.stub_refs["rmdir_fop"] = stub - dl.set_rmdir_fop(c_this,stub) - if "rmdir_cbk" in funcs: - @OpRmdir.cbk_type - def stub (frame, cookie, this, op_ret, op_errno, - preparent, postparent, xdata, s=self): - return s.rmdir_cbk (frame, cookie, this, - op_ret, op_errno, - preparent, postparent, - xdata) - self.stub_refs["rmdir_cbk"] = stub - dl.set_rmdir_cbk(c_this,stub) diff --git a/xlators/features/glupy/src/helloworld.py b/xlators/features/glupy/src/helloworld.py deleted file mode 100644 index 8fe4037118e..00000000000 --- a/xlators/features/glupy/src/helloworld.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -from gluster import * - -class xlator (Translator): - - def __init__(self, c_this): - Translator.__init__(self, c_this) - - def lookup_fop(self, frame, this, loc, xdata): - print "Python xlator: Hello!" - dl.wind_lookup(frame, POINTER(xlator_t)(), loc, xdata) - return 0 - - def lookup_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf, - xdata, postparent): - print "Python xlator: Hello again!" - dl.unwind_lookup(frame, cookie, this, op_ret, op_errno, inode, buf, - xdata, postparent) - return 0 diff --git a/xlators/features/glupy/src/negative.py b/xlators/features/glupy/src/negative.py deleted file mode 100644 index 1023602b9f3..00000000000 --- a/xlators/features/glupy/src/negative.py +++ /dev/null @@ -1,92 +0,0 @@ -import sys -from uuid import UUID -from gluster import * - -# Negative-lookup-caching example. If a file wasn't there the last time we -# looked, it's probably still not there. This translator keeps track of -# those failed lookups for us, and returns ENOENT without needing to pass the -# call any further for repeated requests. - -# If we were doing this for real, we'd need separate caches for each xlator -# instance. 
The easiest way to do this would be to have xlator.__init__ -# "register" each instance in a module-global dict, with the key as the C -# translator address and the value as the xlator object itself. For testing -# and teaching, it's sufficient just to have one cache. The keys are parent -# GFIDs, and the entries are lists of names within that parent that we know -# don't exist. -cache = {} - -# TBD: we need a better way of handling per-request data (frame->local in C). -dl.get_id.restype = c_long -dl.get_id.argtypes = [ POINTER(call_frame_t) ] - -def uuid2str (gfid): - return str(UUID(''.join(map("{0:02x}".format, gfid)))) - -class xlator (Translator): - - def __init__ (self, c_this): - self.requests = {} - Translator.__init__(self,c_this) - - def lookup_fop (self, frame, this, loc, xdata): - pargfid = uuid2str(loc.contents.pargfid) - print "lookup FOP: %s:%s" % (pargfid, loc.contents.name) - # Check the cache. - if cache.has_key(pargfid): - if loc.contents.name in cache[pargfid]: - print "short-circuiting for %s:%s" % (pargfid, - loc.contents.name) - dl.unwind_lookup(frame,0,this,-1,2,None,None,None,None) - return 0 - key = dl.get_id(frame) - self.requests[key] = (pargfid, loc.contents.name[:]) - # TBD: get real child xl from init, pass it here - dl.wind_lookup(frame,POINTER(xlator_t)(),loc,xdata) - return 0 - - def lookup_cbk (self, frame, cookie, this, op_ret, op_errno, inode, buf, - xdata, postparent): - print "lookup CBK: %d (%d)" % (op_ret, op_errno) - key = dl.get_id(frame) - pargfid, name = self.requests[key] - # Update the cache. 
- if op_ret == 0: - print "found %s, removing from cache" % name - if cache.has_key(pargfid): - cache[pargfid].discard(name) - elif op_errno == 2: # ENOENT - print "failed to find %s, adding to cache" % name - if cache.has_key(pargfid): - cache[pargfid].add(name) - else: - cache[pargfid] = set([name]) - del self.requests[key] - dl.unwind_lookup(frame,cookie,this,op_ret,op_errno, - inode,buf,xdata,postparent) - return 0 - - def create_fop (self, frame, this, loc, flags, mode, umask, fd, xdata): - pargfid = uuid2str(loc.contents.pargfid) - print "create FOP: %s:%s" % (pargfid, loc.contents.name) - key = dl.get_id(frame) - self.requests[key] = (pargfid, loc.contents.name[:]) - # TBD: get real child xl from init, pass it here - dl.wind_create(frame,POINTER(xlator_t)(),loc,flags,mode,umask,fd,xdata) - return 0 - - def create_cbk (self, frame, cookie, this, op_ret, op_errno, fd, inode, - buf, preparent, postparent, xdata): - print "create CBK: %d (%d)" % (op_ret, op_errno) - key = dl.get_id(frame) - pargfid, name = self.requests[key] - # Update the cache. 
- if op_ret == 0: - print "created %s, removing from cache" % name - if cache.has_key(pargfid): - cache[pargfid].discard(name) - del self.requests[key] - dl.unwind_create(frame,cookie,this,op_ret,op_errno,fd,inode,buf, - preparent,postparent,xdata) - return 0 - diff --git a/xlators/features/index/src/Makefile.am b/xlators/features/index/src/Makefile.am index 73bb8972e70..c71c238c163 100644 --- a/xlators/features/index/src/Makefile.am +++ b/xlators/features/index/src/Makefile.am @@ -1,15 +1,17 @@ +if WITH_SERVER xlator_LTLIBRARIES = index.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -index_la_LDFLAGS = -module -avoid-version +index_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) index_la_SOURCES = index.c index_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = index.h index-mem-types.h +noinst_HEADERS = index.h index-mem-types.h index-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/xdr/src \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ -I$(top_srcdir)/rpc/rpc-lib/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/index/src/index-mem-types.h b/xlators/features/index/src/index-mem-types.h index 553d492dfbf..58833d0ec9b 100644 --- a/xlators/features/index/src/index-mem-types.h +++ b/xlators/features/index/src/index-mem-types.h @@ -8,15 +8,16 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef __QUIESCE_MEM_TYPES_H__ -#define __QUIESCE_MEM_TYPES_H__ +#ifndef __INDEX_MEM_TYPES_H__ +#define __INDEX_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_index_mem_types_ { - gf_index_mt_priv_t = gf_common_mt_end + 1, - gf_index_inode_ctx_t = gf_common_mt_end + 2, - gf_index_fd_ctx_t = gf_common_mt_end + 3, - gf_index_mt_end + gf_index_mt_priv_t = gf_common_mt_end + 1, + gf_index_inode_ctx_t, + gf_index_fd_ctx_t, + gf_index_mt_local_t, + gf_index_mt_end }; #endif diff --git a/xlators/features/index/src/index-messages.h b/xlators/features/index/src/index-messages.h new file mode 100644 index 00000000000..364f17cd34e --- /dev/null +++ b/xlators/features/index/src/index-messages.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _INDEX_MESSAGES_H_ +#define _INDEX_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(INDEX, INDEX_MSG_INDEX_DIR_CREATE_FAILED, + INDEX_MSG_INDEX_READDIR_FAILED, INDEX_MSG_INDEX_ADD_FAILED, + INDEX_MSG_INDEX_DEL_FAILED, INDEX_MSG_DICT_SET_FAILED, + INDEX_MSG_INODE_CTX_GET_SET_FAILED, INDEX_MSG_INVALID_ARGS, + INDEX_MSG_FD_OP_FAILED, INDEX_MSG_WORKER_THREAD_CREATE_FAILED, + INDEX_MSG_INVALID_GRAPH); + +#endif /* !_INDEX_MESSAGES_H_ */ diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 9253120f3f2..4abb2c73ce5 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -7,1483 +7,2676 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "index.h" -#include "options.h" +#include <glusterfs/options.h> #include "glusterfs3-xdr.h" -#include "syncop.h" +#include <glusterfs/syscall.h> +#include <glusterfs/syncop.h> +#include <glusterfs/common-utils.h> +#include "index-messages.h" +#include <ftw.h> +#include <libgen.h> /* for dirname() */ +#include <signal.h> #define XATTROP_SUBDIR "xattrop" -#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder" +#define DIRTY_SUBDIR "dirty" +#define ENTRY_CHANGES_SUBDIR "entry-changes" -call_stub_t * -__index_dequeue (struct list_head *callstubs) +struct index_syncop_args { + inode_t *parent; + gf_dirent_t *entries; + char *path; +}; + +static char *index_vgfid_xattrs[XATTROP_TYPE_END] = { + [XATTROP] = GF_XATTROP_INDEX_GFID, + [DIRTY] = GF_XATTROP_DIRTY_GFID, + [ENTRY_CHANGES] = GF_XATTROP_ENTRY_CHANGES_GFID}; + +static char *index_subdirs[XATTROP_TYPE_END] = { + [XATTROP] = XATTROP_SUBDIR, + [DIRTY] = DIRTY_SUBDIR, + [ENTRY_CHANGES] = ENTRY_CHANGES_SUBDIR}; + +int +index_get_type_from_vgfid(index_priv_t *priv, uuid_t vgfid) { - call_stub_t *stub = NULL; + int i = 0; - if (!list_empty (callstubs)) { - stub = list_entry (callstubs->next, call_stub_t, list); - list_del_init 
(&stub->list); - } + for (i = 0; i < XATTROP_TYPE_END; i++) { + if (gf_uuid_compare(priv->internal_vgfid[i], vgfid) == 0) + return i; + } + return -1; +} - return stub; +gf_boolean_t +index_is_virtual_gfid(index_priv_t *priv, uuid_t vgfid) +{ + if (index_get_type_from_vgfid(priv, vgfid) < 0) + return _gf_false; + return _gf_true; } -inline static void -__index_enqueue (struct list_head *callstubs, call_stub_t *stub) +static int +__index_inode_ctx_get(inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx) { - list_add_tail (&stub->list, callstubs); + int ret = 0; + index_inode_ctx_t *ictx = NULL; + uint64_t tmpctx = 0; + + ret = __inode_ctx_get(inode, this, &tmpctx); + if (!ret) { + ictx = (index_inode_ctx_t *)(long)tmpctx; + goto out; + } + ictx = GF_CALLOC(1, sizeof(*ictx), gf_index_inode_ctx_t); + if (!ictx) { + ret = -1; + goto out; + } + + INIT_LIST_HEAD(&ictx->callstubs); + ret = __inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ictx); + if (ret) { + GF_FREE(ictx); + ictx = NULL; + goto out; + } +out: + if (ictx) + *ctx = ictx; + return ret; } -static void -worker_enqueue (xlator_t *this, call_stub_t *stub) +static int +index_inode_ctx_get(inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx) { - index_priv_t *priv = NULL; + int ret = 0; - priv = this->private; - pthread_mutex_lock (&priv->mutex); - { - __index_enqueue (&priv->callstubs, stub); - pthread_cond_signal (&priv->cond); - } - pthread_mutex_unlock (&priv->mutex); + LOCK(&inode->lock); + { + ret = __index_inode_ctx_get(inode, this, ctx); + } + UNLOCK(&inode->lock); + + return ret; } -void * -index_worker (void *data) -{ - index_priv_t *priv = NULL; - xlator_t *this = NULL; - call_stub_t *stub = NULL; - int ret = 0; - - THIS = data; - this = data; - priv = this->private; - - for (;;) { - pthread_mutex_lock (&priv->mutex); - { - while (list_empty (&priv->callstubs)) { - ret = pthread_cond_wait (&priv->cond, - &priv->mutex); - } - - stub = __index_dequeue (&priv->callstubs); - } - 
pthread_mutex_unlock (&priv->mutex); +static gf_boolean_t +index_is_subdir_of_entry_changes(xlator_t *this, inode_t *inode) +{ + index_inode_ctx_t *ctx = NULL; + int ret = 0; - if (stub) /* guard against spurious wakeups */ - call_resume (stub); - } + if (!inode) + return _gf_false; - return NULL; + ret = index_inode_ctx_get(inode, this, &ctx); + if ((ret == 0) && !gf_uuid_is_null(ctx->virtual_pargfid)) + return _gf_true; + return _gf_false; } -int -__index_inode_ctx_get (inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx) + +static int +index_get_type_from_vgfid_xattr(const char *name) { - int ret = 0; - index_inode_ctx_t *ictx = NULL; - uint64_t tmpctx = 0; + int i = 0; - ret = __inode_ctx_get (inode, this, &tmpctx); - if (!ret) { - ictx = (index_inode_ctx_t*) (long) tmpctx; - goto out; - } - ictx = GF_CALLOC (1, sizeof (*ictx), gf_index_inode_ctx_t); - if (!ictx) { - ret = -1; - goto out; - } + for (i = 0; i < XATTROP_TYPE_END; i++) { + if (strcmp(name, index_vgfid_xattrs[i]) == 0) + return i; + } + return -1; +} - INIT_LIST_HEAD (&ictx->callstubs); - ret = __inode_ctx_put (inode, this, (uint64_t)ictx); - if (ret) { - GF_FREE (ictx); - ictx = NULL; - goto out; - } -out: - if (ictx) - *ctx = ictx; - return ret; +gf_boolean_t +index_is_fop_on_internal_inode(xlator_t *this, inode_t *inode, uuid_t gfid) +{ + index_priv_t *priv = this->private; + uuid_t vgfid = {0}; + + if (!inode) + return _gf_false; + + if (gfid && !gf_uuid_is_null(gfid)) + gf_uuid_copy(vgfid, gfid); + else + gf_uuid_copy(vgfid, inode->gfid); + + if (index_is_virtual_gfid(priv, vgfid)) + return _gf_true; + if (index_is_subdir_of_entry_changes(this, inode)) + return _gf_true; + return _gf_false; } -int -index_inode_ctx_get (inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx) +static gf_boolean_t +index_is_vgfid_xattr(const char *name) { - int ret = 0; + if (index_get_type_from_vgfid_xattr(name) < 0) + return _gf_false; + return _gf_true; +} - LOCK (&inode->lock); - { - ret = 
__index_inode_ctx_get (inode, this, ctx); - } - UNLOCK (&inode->lock); +call_stub_t * +__index_dequeue(struct list_head *callstubs) +{ + call_stub_t *stub = NULL; + + if (!list_empty(callstubs)) { + stub = list_entry(callstubs->next, call_stub_t, list); + list_del_init(&stub->list); + } - return ret; + return stub; } static void -make_index_dir_path (char *base, const char *subdir, - char *index_dir, size_t len) +__index_enqueue(struct list_head *callstubs, call_stub_t *stub) { - snprintf (index_dir, len, "%s/%s", base, subdir); + list_add_tail(&stub->list, callstubs); } -int -index_dir_create (xlator_t *this, const char *subdir) -{ - int ret = 0; - struct stat st = {0}; - char fullpath[PATH_MAX] = {0}; - char path[PATH_MAX] = {0}; - char *dir = NULL; - index_priv_t *priv = NULL; - size_t len = 0; - size_t pathlen = 0; - - priv = this->private; - make_index_dir_path (priv->index_basepath, subdir, fullpath, - sizeof (fullpath)); - ret = stat (fullpath, &st); - if (!ret) { - if (!S_ISDIR (st.st_mode)) - ret = -2; - goto out; - } - - pathlen = strlen (fullpath); - if ((pathlen > 1) && fullpath[pathlen - 1] == '/') - fullpath[pathlen - 1] = '\0'; - dir = strchr (fullpath, '/'); - while (dir) { - dir = strchr (dir + 1, '/'); - if (dir) - len = pathlen - strlen (dir); - else - len = pathlen; - strncpy (path, fullpath, len); - path[len] = '\0'; - ret = mkdir (path, 0600); - if (ret && (errno != EEXIST)) - goto out; - } - ret = 0; -out: - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s: Failed to " - "create (%s)", priv->index_basepath, subdir, - strerror (errno)); - } else if (ret == -2) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s: Failed to create, " - "path exists, not a directory ", priv->index_basepath, - subdir); - } - return ret; +static void +worker_enqueue(xlator_t *this, call_stub_t *stub) +{ + index_priv_t *priv = NULL; + + priv = this->private; + pthread_mutex_lock(&priv->mutex); + { + __index_enqueue(&priv->callstubs, stub); + 
GF_ATOMIC_INC(priv->stub_cnt); + pthread_cond_signal(&priv->cond); + } + pthread_mutex_unlock(&priv->mutex); } -void -index_get_index (index_priv_t *priv, uuid_t index) +void * +index_worker(void *data) { - LOCK (&priv->lock); + index_priv_t *priv = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + gf_boolean_t bye = _gf_false; + + THIS = data; + this = data; + priv = this->private; + + for (;;) { + pthread_mutex_lock(&priv->mutex); { - uuid_copy (index, priv->index); - } - UNLOCK (&priv->lock); + while (list_empty(&priv->callstubs)) { + if (priv->down) { + bye = _gf_true; /*Avoid wait*/ + break; + } + (void)pthread_cond_wait(&priv->cond, &priv->mutex); + if (priv->down) { + bye = _gf_true; + break; + } + } + if (!bye) + stub = __index_dequeue(&priv->callstubs); + if (bye) { + priv->curr_count--; + if (priv->curr_count == 0) + pthread_cond_broadcast(&priv->cond); + } + } + pthread_mutex_unlock(&priv->mutex); + + if (stub) { /* guard against spurious wakeups */ + call_resume(stub); + GF_ATOMIC_DEC(priv->stub_cnt); + } + stub = NULL; + if (bye) + break; + } + + return NULL; +} + +static void +make_index_dir_path(char *base, const char *subdir, char *index_dir, size_t len) +{ + snprintf(index_dir, len, "%s/%s", base, subdir); +} + +int +index_dir_create(xlator_t *this, const char *subdir) +{ + int ret = 0; + struct stat st = {0}; + char fullpath[PATH_MAX] = {0}; + char path[PATH_MAX] = {0}; + char *dir = NULL; + index_priv_t *priv = NULL; + size_t len = 0; + size_t pathlen = 0; + + priv = this->private; + make_index_dir_path(priv->index_basepath, subdir, fullpath, + sizeof(fullpath)); + ret = sys_stat(fullpath, &st); + if (!ret) { + if (!S_ISDIR(st.st_mode)) + ret = -2; + goto out; + } + + pathlen = strlen(fullpath); + if ((pathlen > 1) && fullpath[pathlen - 1] == '/') + fullpath[pathlen - 1] = '\0'; + dir = strchr(fullpath, '/'); + while (dir) { + dir = strchr(dir + 1, '/'); + if (dir) + len = pathlen - strlen(dir); + else + len = pathlen; + strncpy(path, 
fullpath, len); + path[len] = '\0'; + ret = sys_mkdir(path, 0600); + if (ret && (errno != EEXIST)) + goto out; + } + ret = 0; +out: + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, + INDEX_MSG_INDEX_DIR_CREATE_FAILED, + "%s/%s: Failed to " + "create", + priv->index_basepath, subdir); + } else if (ret == -2) { + gf_msg(this->name, GF_LOG_ERROR, ENOTDIR, + INDEX_MSG_INDEX_DIR_CREATE_FAILED, + "%s/%s: Failed to " + "create, path exists, not a directory ", + priv->index_basepath, subdir); + } + return ret; } void -index_generate_index (index_priv_t *priv, uuid_t index) +index_get_index(index_priv_t *priv, uuid_t index) { - LOCK (&priv->lock); - { - //To prevent duplicate generates. - //This method fails if number of contending threads is greater - //than MAX_LINK count of the fs - if (!uuid_compare (priv->index, index)) - uuid_generate (priv->index); - uuid_copy (index, priv->index); - } - UNLOCK (&priv->lock); + LOCK(&priv->lock); + { + gf_uuid_copy(index, priv->index); + } + UNLOCK(&priv->lock); } -static void -make_index_path (char *base, const char *subdir, uuid_t index, - char *index_path, size_t len) +void +index_generate_index(index_priv_t *priv, uuid_t index) { - make_index_dir_path (base, subdir, index_path, len); - snprintf (index_path + strlen (index_path), len - strlen (index_path), - "/%s-%s", subdir, uuid_utoa (index)); + LOCK(&priv->lock); + { + // To prevent duplicate generates. 
+ // This method fails if number of contending threads is greater + // than MAX_LINK count of the fs + if (!gf_uuid_compare(priv->index, index)) + gf_uuid_generate(priv->index); + gf_uuid_copy(index, priv->index); + } + UNLOCK(&priv->lock); } static void -make_gfid_path (char *base, const char *subdir, uuid_t gfid, - char *gfid_path, size_t len) +make_index_path(char *base, const char *subdir, uuid_t index, char *index_path, + size_t len) { - make_index_dir_path (base, subdir, gfid_path, len); - snprintf (gfid_path + strlen (gfid_path), len - strlen (gfid_path), - "/%s", uuid_utoa (gfid)); + make_index_dir_path(base, subdir, index_path, len); + snprintf(index_path + strlen(index_path), len - strlen(index_path), + "/%s-%s", subdir, uuid_utoa(index)); } static void -make_file_path (char *base, const char *subdir, const char *filename, - char *file_path, size_t len) +make_gfid_path(char *base, const char *subdir, uuid_t gfid, char *gfid_path, + size_t len) { - make_index_dir_path (base, subdir, file_path, len); - snprintf (file_path + strlen (file_path), len - strlen (file_path), - "/%s", filename); + make_index_dir_path(base, subdir, gfid_path, len); + snprintf(gfid_path + strlen(gfid_path), len - strlen(gfid_path), "/%s", + uuid_utoa(gfid)); } static void -check_delete_stale_index_file (xlator_t *this, char *filename) +make_file_path(char *base, const char *subdir, const char *filename, + char *file_path, size_t len) { - int ret = 0; - struct stat st = {0}; - struct stat base_index_st = {0}; - char filepath[PATH_MAX] = {0}; - char filepath_under_base_indices_holder[PATH_MAX] = {0}; - index_priv_t *priv = NULL; - - priv = this->private; - if (priv->to_be_healed_states != synced_state) - return; - - make_file_path (priv->index_basepath, XATTROP_SUBDIR, - filename, filepath, sizeof (filepath)); - - make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - filename, filepath_under_base_indices_holder, - sizeof (filepath_under_base_indices_holder)); - - - ret = 
stat (filepath_under_base_indices_holder, &base_index_st); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" - "under index/base_indices_holder"); - return; - } - - ret = stat (filepath, &st); - if (!ret && st.st_nlink == 2) { - unlink (filepath); - unlink (filepath_under_base_indices_holder); - } + make_index_dir_path(base, subdir, file_path, len); + snprintf(file_path + strlen(file_path), len - strlen(file_path), "/%s", + filename); } static int -index_fill_readdir (fd_t *fd, DIR *dir, off_t off, - size_t size, gf_dirent_t *entries, readdir_directory type) -{ - off_t in_case = -1; - size_t filled = 0; - int count = 0; - char entrybuf[sizeof(struct dirent) + 256 + 8]; - struct dirent *entry = NULL; - int32_t this_size = -1; - gf_dirent_t *this_entry = NULL; - xlator_t *this = NULL; - - this = THIS; - if (!off) { - rewinddir (dir); - } else { - seekdir (dir, off); - } - - while (filled <= size) { - in_case = telldir (dir); - - if (in_case == -1) { - gf_log (THIS->name, GF_LOG_ERROR, - "telldir failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - - errno = 0; - entry = NULL; - readdir_r (dir, (struct dirent *)entrybuf, &entry); - - if (!entry) { - if (errno == EBADF) { - gf_log (THIS->name, GF_LOG_WARNING, - "readdir failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - break; - } - - if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-")) && - (type == INDEX_XATTROP)) { - check_delete_stale_index_file (this, entry->d_name); - continue; - } - - this_size = max (sizeof (gf_dirent_t), - sizeof (gfs3_dirplist)) - + strlen (entry->d_name) + 1; - - if (this_size + filled > size) { - seekdir (dir, in_case); - break; - } +is_index_file_current(char *filename, uuid_t priv_index, char *subdir) +{ + char current_index[GF_UUID_BUF_SIZE + 16] = { + 0, + }; - this_entry = gf_dirent_for_name (entry->d_name); + snprintf(current_index, sizeof current_index, "%s-%s", subdir, + uuid_utoa(priv_index)); + 
return (!strcmp(filename, current_index)); +} - if (!this_entry) { - gf_log (THIS->name, GF_LOG_ERROR, - "could not create gf_dirent for entry %s: (%s)", - entry->d_name, strerror (errno)); - goto out; - } - this_entry->d_off = telldir (dir); - this_entry->d_ino = entry->d_ino; +static void +check_delete_stale_index_file(xlator_t *this, char *filename, char *subdir) +{ + int ret = 0; + struct stat st = {0}; + char filepath[PATH_MAX] = {0}; + index_priv_t *priv = NULL; - list_add_tail (&this_entry->list, &entries->list); + priv = this->private; - filled += this_size; - count ++; - } + if (is_index_file_current(filename, priv->index, subdir)) + return; - if ((!readdir (dir) && (errno == 0))) - /* Indicate EOF */ - errno = ENOENT; -out: - return count; + make_file_path(priv->index_basepath, subdir, filename, filepath, + sizeof(filepath)); + ret = sys_stat(filepath, &st); + if (!ret && st.st_nlink == 1) + sys_unlink(filepath); } -int -sync_base_indices (void *index_priv) -{ - index_priv_t *priv = NULL; - DIR *dir_base_holder = NULL; - DIR *xattrop_dir = NULL; - struct dirent *entry = NULL; - char base_indices_holder[PATH_MAX] = {0}; - char xattrop_directory[PATH_MAX] = {0}; - char base_index_path[PATH_MAX] = {0}; - char xattrop_index_path[PATH_MAX] = {0}; - int ret = 0; - - priv = index_priv; - - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); - snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, - XATTROP_SUBDIR); - - if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { - ret = -1; - goto out; - } - if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { - ret = -1; - goto out; - } +static void +index_set_link_count(index_priv_t *priv, int64_t count, + index_xattrop_type_t type) +{ + switch (type) { + case XATTROP: + LOCK(&priv->lock); + { + priv->pending_count = count; + } + UNLOCK(&priv->lock); + break; + default: + break; + } +} - priv->to_be_healed_states = sync_started; - while 
((entry = readdir(xattrop_dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name, "..")) { - continue; - } - if (strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { - continue; - } - if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { +static void +index_get_link_count(index_priv_t *priv, int64_t *count, + index_xattrop_type_t type) +{ + switch (type) { + case XATTROP: + LOCK(&priv->lock); + { + *count = priv->pending_count; + } + UNLOCK(&priv->lock); + break; + default: + break; + } +} - snprintf (xattrop_index_path, PATH_MAX, "%s/%s", - xattrop_directory, entry->d_name); +static void +index_dec_link_count(index_priv_t *priv, index_xattrop_type_t type) +{ + switch (type) { + case XATTROP: + LOCK(&priv->lock); + { + priv->pending_count--; + if (priv->pending_count == 0) + priv->pending_count--; + } + UNLOCK(&priv->lock); + break; + default: + break; + } +} - snprintf (base_index_path, PATH_MAX, "%s/%s", - base_indices_holder, entry->d_name); +char * +index_get_subdir_from_type(index_xattrop_type_t type) +{ + if (type < XATTROP || type >= XATTROP_TYPE_END) + return NULL; + return index_subdirs[type]; +} - ret = link (xattrop_index_path, base_index_path); - if (ret && errno != EEXIST) - goto out; +char * +index_get_subdir_from_vgfid(index_priv_t *priv, uuid_t vgfid) +{ + return index_get_subdir_from_type(index_get_type_from_vgfid(priv, vgfid)); +} - } - } - ret = closedir (xattrop_dir); - if (ret) +static int +index_fill_readdir(fd_t *fd, index_fd_ctx_t *fctx, DIR *dir, off_t off, + size_t size, gf_dirent_t *entries) +{ + off_t in_case = -1; + off_t last_off = 0; + size_t filled = 0; + int count = 0; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + int32_t this_size = -1; + gf_dirent_t *this_entry = NULL; + xlator_t *this = NULL; + + this = THIS; + if (!off) { + rewinddir(dir); + } else { + seekdir(dir, off); +#ifndef GF_LINUX_HOST_OS + if 
((u_long)telldir(dir) != off && off != fctx->dir_eof) { + gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, + INDEX_MSG_INDEX_READDIR_FAILED, + "seekdir(0x%llx) failed on dir=%p: " + "Invalid argument (offset reused from " + "another DIR * structure?)", + off, dir); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + } + + while (filled <= size) { + in_case = (u_long)telldir(dir); + + if (in_case == -1) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, + INDEX_MSG_INDEX_READDIR_FAILED, "telldir failed on dir=%p", + dir); + goto out; + } + + errno = 0; + entry = sys_readdir(dir, scratch); + if (!entry || errno != 0) { + if (errno == EBADF) { + gf_msg(THIS->name, GF_LOG_WARNING, errno, + INDEX_MSG_INDEX_READDIR_FAILED, + "readdir failed on dir=%p", dir); goto out; - ret = closedir (dir_base_holder); - if (ret) + } + break; + } + + if (!strncmp(entry->d_name, XATTROP_SUBDIR "-", + strlen(XATTROP_SUBDIR "-"))) { + check_delete_stale_index_file(this, entry->d_name, XATTROP_SUBDIR); + continue; + } else if (!strncmp(entry->d_name, DIRTY_SUBDIR "-", + strlen(DIRTY_SUBDIR "-"))) { + check_delete_stale_index_file(this, entry->d_name, DIRTY_SUBDIR); + continue; + } + + this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) + + strlen(entry->d_name) + 1; + + if (this_size + filled > size) { + seekdir(dir, in_case); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != in_case && in_case != fctx->dir_eof) { + gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, + INDEX_MSG_INDEX_READDIR_FAILED, + "seekdir(0x%llx) failed on dir=%p: " + "Invalid argument (offset reused from " + "another DIR * structure?)", + in_case, dir); + errno = EINVAL; + count = -1; goto out; + } +#endif /* GF_LINUX_HOST_OS */ + break; + } + + this_entry = gf_dirent_for_name(entry->d_name); + + if (!this_entry) { + gf_msg(THIS->name, GF_LOG_ERROR, errno, + INDEX_MSG_INDEX_READDIR_FAILED, + "could not create gf_dirent for entry %s", entry->d_name); + goto out; + } + /* + * we store the offset of 
next entry here, which is + * probably not intended, but code using syncop_readdir() + * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it + * for directory read resumption. + */ + last_off = (u_long)telldir(dir); + this_entry->d_off = last_off; + this_entry->d_ino = entry->d_ino; + + list_add_tail(&this_entry->list, &entries->list); + + filled += this_size; + count++; + } + + errno = 0; + + if ((!sys_readdir(dir, scratch) && (errno == 0))) { + /* Indicate EOF */ + errno = ENOENT; + /* Remember EOF offset for later detection */ + fctx->dir_eof = last_off; + } +out: + return count; +} +int +index_link_to_base(xlator_t *this, char *fpath, const char *subdir) +{ + int ret = 0; + int fd = 0; + int op_errno = 0; + uuid_t index = {0}; + index_priv_t *priv = this->private; + char base[PATH_MAX] = {0}; + + index_get_index(priv, index); + make_index_path(priv->index_basepath, subdir, index, base, sizeof(base)); + + ret = sys_link(base, fpath); + if (!ret || (errno == EEXIST)) { ret = 0; -out: - return ret; + goto out; + } + op_errno = errno; + if (op_errno == ENOENT) { + ret = index_dir_create(this, subdir); + if (ret) { + op_errno = errno; + goto out; + } + } else if (op_errno == EMLINK) { + index_generate_index(priv, index); + make_index_path(priv->index_basepath, subdir, index, base, + sizeof(base)); + } else { + goto out; + } + + op_errno = 0; + fd = sys_creat(base, 0); + if ((fd < 0) && (errno != EEXIST)) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, INDEX_MSG_INDEX_ADD_FAILED, + "%s: Not able to " + "create index", + fpath); + goto out; + } + + if (fd >= 0) + sys_close(fd); + + ret = sys_link(base, fpath); + if (ret && (errno != EEXIST)) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, INDEX_MSG_INDEX_ADD_FAILED, + "%s: Not able to " + "add to index", + fpath); + goto out; + } +out: + return -op_errno; } int -base_indices_syncing_done (int ret, call_frame_t *frame, void *data) +index_add(xlator_t *this, uuid_t gfid, const char 
*subdir, + index_xattrop_type_t type) { - index_priv_t *priv = NULL; - priv = data; + char gfid_path[PATH_MAX] = {0}; + int ret = -1; + index_priv_t *priv = NULL; + struct stat st = {0}; - if (!priv) - goto out; + priv = this->private; - if (ret) { - priv->to_be_healed_states = sync_not_started; - } else { - priv->to_be_healed_states = synced_state; - } + if (gf_uuid_is_null(gfid)) { + GF_ASSERT(0); + goto out; + } - STACK_DESTROY (frame->root); + make_gfid_path(priv->index_basepath, subdir, gfid, gfid_path, + sizeof(gfid_path)); + ret = sys_stat(gfid_path, &st); + if (!ret) + goto out; + ret = index_link_to_base(this, gfid_path, subdir); out: - return 0; + return ret; } int -sync_base_indices_from_xattrop (xlator_t *this) +index_del(xlator_t *this, uuid_t gfid, const char *subdir, int type) { + int32_t op_errno __attribute__((unused)) = 0; + index_priv_t *priv = NULL; + int ret = 0; + char gfid_path[PATH_MAX] = {0}; + char rename_dst[PATH_MAX] = { + 0, + }; + uuid_t uuid; + + priv = this->private; + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out, + op_errno, EINVAL); + make_gfid_path(priv->index_basepath, subdir, gfid, gfid_path, + sizeof(gfid_path)); + + if ((strcmp(subdir, ENTRY_CHANGES_SUBDIR)) == 0) { + ret = sys_rmdir(gfid_path); + /* rmdir above could fail with ENOTEMPTY if the indices under + * it were created when granular-entry-heal was enabled, whereas + * the actual heal that happened was non-granular (or full) in + * nature, resulting in name indices getting left out. To + * clean up this directory without it affecting the IO path perf, + * the directory is renamed to a unique name under + * indices/entry-changes. Self-heal will pick up this entry + * during crawl and on lookup into the file system figure that + * the index is stale and subsequently wipe it out using rmdir(). 
+ */ + if ((ret) && (errno == ENOTEMPTY)) { + gf_uuid_generate(uuid); + make_gfid_path(priv->index_basepath, subdir, uuid, rename_dst, + sizeof(rename_dst)); + ret = sys_rename(gfid_path, rename_dst); + } + } else { + ret = sys_unlink(gfid_path); + } + + if (ret && (errno != ENOENT)) { + gf_msg(this->name, GF_LOG_ERROR, errno, INDEX_MSG_INDEX_DEL_FAILED, + "%s: failed to delete" + " from index", + gfid_path); + ret = -errno; + goto out; + } + + index_dec_link_count(priv, type); + ret = 0; +out: + return ret; +} - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - int ret = 0; - struct stat st = {0}; - DIR *dir = NULL; - struct dirent *entry = NULL; - call_frame_t *frame = NULL; - - priv = this->private; - - if (priv->to_be_healed_states != sync_not_started) { - ret = -1; - goto out; - } +static gf_boolean_t +_is_xattr_in_watchlist(dict_t *d, char *k, data_t *v, void *tmp) +{ + if (!strncmp(k, tmp, strlen(k))) + return _gf_true; - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); + return _gf_false; +} - ret = stat (base_indices_holder, &st); +static gf_boolean_t +is_xattr_in_watchlist(dict_t *this, char *key, data_t *value, void *matchdata) +{ + int ret = -1; + + // matchdata is a list of xattrs + // key is strncmp'ed with each xattr in matchdata. + // ret will be 0 if key pattern is not present in the matchdata + // else ret will be count number of xattrs the key pattern-matches with. 
+ ret = dict_foreach_match(matchdata, _is_xattr_in_watchlist, key, + dict_null_foreach_fn, NULL); + + if (ret > 0) + return _gf_true; + return _gf_false; +} - if (ret && (errno != ENOENT)) { - goto out; - } else if (errno == ENOENT) { - ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); - if (ret) - goto out; - } else { - if ((dir = opendir (base_indices_holder)) == NULL) { - ret = -1; - goto out; - } - while ((entry = readdir (dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name,"..")) { - continue; - } - ret = unlink (entry->d_name); - if (ret) - goto out; - } - closedir (dir); - } +static int +index_find_xattr_type(dict_t *d, char *k, data_t *v) +{ + int idx = -1; + index_priv_t *priv = THIS->private; - /*At this point of time we have index/base_indicies_holder directory - *is with no entries*/ + if (priv->dirty_watchlist && + is_xattr_in_watchlist(d, k, v, priv->dirty_watchlist)) + idx = DIRTY; + else if (priv->pending_watchlist && + is_xattr_in_watchlist(d, k, v, priv->pending_watchlist)) + idx = XATTROP; - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); + return idx; +} - frame->root->pid = LOW_PRIO_PROC_PID; +int +index_fill_zero_array(dict_t *d, char *k, data_t *v, void *adata) +{ + int idx = -1; + int *zfilled = adata; + // zfilled array contains `state` for all types xattrs. + // state : whether the gfid file of this file exists in + // corresponding xattr directory or not. 
+ + idx = index_find_xattr_type(d, k, v); + if (idx == -1) + return 0; + zfilled[idx] = 0; + return 0; +} - ret = synctask_new (this->ctx->env, sync_base_indices, - base_indices_syncing_done,frame, priv); +static int +_check_key_is_zero_filled(dict_t *d, char *k, data_t *v, void *tmp) +{ + int *zfilled = tmp; + int idx = -1; + idx = index_find_xattr_type(d, k, v); + if (idx == -1) + return 0; + /* Along with checking that the value of a key is zero filled + * the key's corresponding index should be assigned + * appropriate value. + * zfilled[idx] will be 0(false) if value not zero. + * will be 1(true) if value is zero. + */ + if (mem_0filled((const char *)v->data, v->len)) { + zfilled[idx] = 0; + return 0; + } + + /* If zfilled[idx] was previously 0, it means at least + * one xattr of its "kind" is non-zero. Keep its value + * the same. + */ + if (zfilled[idx]) + zfilled[idx] = 1; + return 0; +} +int +index_entry_create(xlator_t *this, inode_t *inode, char *filename) +{ + int ret = -1; + int op_errno = 0; + char pgfid_path[PATH_MAX] = {0}; + char entry_path[PATH_MAX] = {0}; + index_priv_t *priv = NULL; + index_inode_ctx_t *ctx = NULL; + int32_t len = 0; + + priv = this->private; + + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(inode->gfid), + out, op_errno, EINVAL); + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, filename, out, op_errno, EINVAL); + + ret = index_inode_ctx_get(inode, this, &ctx); + if (ret) { + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + INDEX_MSG_INODE_CTX_GET_SET_FAILED, + "Not able to get inode ctx for %s", uuid_utoa(inode->gfid)); + goto out; + } + + make_gfid_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, inode->gfid, + pgfid_path, sizeof(pgfid_path)); + + if (ctx->state[ENTRY_CHANGES] != IN) { + ret = sys_mkdir(pgfid_path, 0600); + if (ret != 0 && errno != EEXIST) { + op_errno = errno; + goto out; + } + ctx->state[ENTRY_CHANGES] = IN; + } + + if (strchr(filename, '/')) { + gf_msg(this->name, GF_LOG_ERROR, 
EINVAL, INDEX_MSG_INDEX_ADD_FAILED, + "Got invalid entry (%s) for pargfid path (%s)", filename, + pgfid_path); + op_errno = EINVAL; + goto out; + } + + len = snprintf(entry_path, sizeof(entry_path), "%s/%s", pgfid_path, + filename); + if ((len < 0) || (len >= sizeof(entry_path))) { + op_errno = EINVAL; + goto out; + } + + op_errno = 0; + + ret = index_link_to_base(this, entry_path, ENTRY_CHANGES_SUBDIR); out: - return ret; - + if (op_errno) + ret = -op_errno; + return ret; } int -index_add (xlator_t *this, uuid_t gfid, const char *subdir) -{ - int32_t op_errno = 0; - char gfid_path[PATH_MAX] = {0}; - char index_path[PATH_MAX] = {0}; - char base_path[PATH_MAX] = {0}; - int ret = 0; - uuid_t index = {0}; - index_priv_t *priv = NULL; - struct stat st = {0}; - int fd = 0; - int index_created = 0; - - priv = this->private; - GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), - out, op_errno, EINVAL); - - make_gfid_path (priv->index_basepath, subdir, gfid, - gfid_path, sizeof (gfid_path)); - - ret = stat (gfid_path, &st); - if (!ret) - goto out; - index_get_index (priv, index); - make_index_path (priv->index_basepath, subdir, - index, index_path, sizeof (index_path)); - ret = link (index_path, gfid_path); - if (!ret || (errno == EEXIST)) { - ret = 0; - index_created = 1; - goto out; - } - - +index_entry_delete(xlator_t *this, uuid_t pgfid, char *filename) +{ + int ret = 0; + int op_errno = 0; + char pgfid_path[PATH_MAX] = {0}; + char entry_path[PATH_MAX] = {0}; + index_priv_t *priv = NULL; + int32_t len = 0; + + priv = this->private; + + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(pgfid), out, + op_errno, EINVAL); + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, filename, out, op_errno, EINVAL); + + make_gfid_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, pgfid, + pgfid_path, sizeof(pgfid_path)); + + if (strchr(filename, '/')) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_INDEX_DEL_FAILED, + "Got invalid entry (%s) for pargfid path 
(%s)", filename, + pgfid_path); + op_errno = EINVAL; + goto out; + } + + len = snprintf(entry_path, sizeof(entry_path), "%s/%s", pgfid_path, + filename); + if ((len < 0) || (len >= sizeof(entry_path))) { + op_errno = EINVAL; + goto out; + } + + ret = sys_unlink(entry_path); + if (ret && (errno != ENOENT)) { op_errno = errno; - if (op_errno == ENOENT) { - ret = index_dir_create (this, subdir); - if (ret) - goto out; - } else if (op_errno == EMLINK) { - index_generate_index (priv, index); - make_index_path (priv->index_basepath, subdir, - index, index_path, sizeof (index_path)); - } else { - goto out; - } + gf_msg(this->name, GF_LOG_ERROR, op_errno, INDEX_MSG_INDEX_DEL_FAILED, + "%s: failed to delete from index/entry-changes", entry_path); + } - fd = creat (index_path, 0); - if ((fd < 0) && (errno != EEXIST)) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, "%s: Not able to " - "create index (%s)", uuid_utoa (gfid), - strerror (errno)); - goto out; - } +out: + return -op_errno; +} - if (fd >= 0) - close (fd); +int +index_entry_action(xlator_t *this, inode_t *inode, dict_t *xdata, char *key) +{ + int ret = 0; + char *filename = NULL; - ret = link (index_path, gfid_path); - if (ret && (errno != EEXIST)) { - gf_log (this->name, GF_LOG_ERROR, "%s: Not able to " - "add to index (%s)", uuid_utoa (gfid), - strerror (errno)); - goto out; - } else { - index_created = 1; - } + ret = dict_get_str(xdata, key, &filename); + if (ret != 0) { + ret = 0; + goto out; + } - if (priv->to_be_healed_states != sync_not_started) { - make_index_path (priv->index_basepath, - GF_BASE_INDICES_HOLDER_GFID, - index, base_path, sizeof (base_path)); - ret = link (index_path, base_path); - if (ret) - goto out; - } + if (strcmp(key, GF_XATTROP_ENTRY_IN_KEY) == 0) + ret = index_entry_create(this, inode, filename); + else if (strcmp(key, GF_XATTROP_ENTRY_OUT_KEY) == 0) + ret = index_entry_delete(this, inode->gfid, filename); - ret = 0; out: - /*If base_indices_holder is not created: create and sync - 
*If directory is present: delete contents and start syncing - *If syncing is in progress :No need to do any thing - *If syncing is done: No need to do anything*/ - if (!ret) { - switch (priv->to_be_healed_states) { - case sync_not_started: - ret = sync_base_indices_from_xattrop (this); - break; - case sync_started: - case synced_state: - /*No need to do anything*/ - break; - } - } - return ret; + return ret; } -int -index_del (xlator_t *this, uuid_t gfid, const char *subdir) -{ - int32_t op_errno __attribute__((unused)) = 0; - index_priv_t *priv = NULL; - int ret = 0; - char gfid_path[PATH_MAX] = {0}; - - priv = this->private; - GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), - out, op_errno, EINVAL); - make_gfid_path (priv->index_basepath, subdir, gfid, - gfid_path, sizeof (gfid_path)); - ret = unlink (gfid_path); - if (ret && (errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to delete from index (%s)", - gfid_path, strerror (errno)); - ret = -errno; - goto out; - } - ret = 0; +void +_index_action(xlator_t *this, inode_t *inode, int *zfilled) +{ + int ret = 0; + int i = 0; + index_inode_ctx_t *ctx = NULL; + char *subdir = NULL; + + ret = index_inode_ctx_get(inode, this, &ctx); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + INDEX_MSG_INODE_CTX_GET_SET_FAILED, + "Not able to get" + " inode context for %s.", + uuid_utoa(inode->gfid)); + goto out; + } + + for (i = 0; i < XATTROP_TYPE_END; i++) { + subdir = index_get_subdir_from_type(i); + if (zfilled[i] == 1) { + if (ctx->state[i] == NOTIN) + continue; + ret = index_del(this, inode->gfid, subdir, i); + if (!ret) + ctx->state[i] = NOTIN; + } else if (zfilled[i] == 0) { + if (ctx->state[i] == IN) + continue; + ret = index_add(this, inode->gfid, subdir, i); + if (!ret) + ctx->state[i] = IN; + } + } out: - return ret; + return; } -static int -_check_key_is_zero_filled (dict_t *d, char *k, data_t *v, - void *tmp) +static void +index_init_state(xlator_t *this, inode_t *inode, 
index_inode_ctx_t *ctx, + char *subdir) { - if (mem_0filled ((const char*)v->data, v->len)) { - /* -1 means, no more iterations, treat as 'break' */ - return -1; - } - return 0; -} + int ret = -1; + char pgfid_path[PATH_MAX] = {0}; + struct stat st = {0}; + index_priv_t *priv = NULL; + priv = this->private; -void -_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr) -{ - gf_boolean_t zero_xattr = _gf_true; - index_inode_ctx_t *ctx = NULL; - int ret = 0; + make_gfid_path(priv->index_basepath, subdir, inode->gfid, pgfid_path, + sizeof(pgfid_path)); - ret = dict_foreach (xattr, _check_key_is_zero_filled, NULL); - if (ret == -1) - zero_xattr = _gf_false; + ret = sys_stat(pgfid_path, &st); + if (ret == 0) + ctx->state[ENTRY_CHANGES] = IN; + else if (ret != 0 && errno == ENOENT) + ctx->state[ENTRY_CHANGES] = NOTIN; - ret = index_inode_ctx_get (inode, this, &ctx); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Not able to %s %s -> index", - zero_xattr?"add":"del", uuid_utoa (inode->gfid)); - goto out; - } - if (zero_xattr) { - if (ctx->state == NOTIN) - goto out; - ret = index_del (this, inode->gfid, XATTROP_SUBDIR); - if (!ret) - ctx->state = NOTIN; - } else { - if (ctx->state == IN) - goto out; - ret = index_add (this, inode->gfid, XATTROP_SUBDIR); - if (!ret) - ctx->state = IN; - } -out: - return; + return; } void -fop_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr) +xattrop_index_action(xlator_t *this, index_local_t *local, dict_t *xattr, + dict_match_t match, void *match_data) { - _xattrop_index_action (this, inode, xattr); -} + int ret = 0; + int zfilled[XATTROP_TYPE_END] = { + 0, + }; + int8_t value = 0; + char *subdir = NULL; + dict_t *req_xdata = NULL; + inode_t *inode = NULL; + index_inode_ctx_t *ctx = NULL; + + inode = local->inode; + req_xdata = local->xdata; + + memset(zfilled, -1, sizeof(zfilled)); + ret = dict_foreach_match(xattr, match, match_data, + _check_key_is_zero_filled, zfilled); + _index_action(this, inode, 
zfilled); + + if (req_xdata) { + ret = index_entry_action(this, inode, req_xdata, + GF_XATTROP_ENTRY_OUT_KEY); + + ret = dict_get_int8(req_xdata, GF_XATTROP_PURGE_INDEX, &value); + if ((ret) || (value == 0)) + goto out; + } + + if (zfilled[XATTROP] != 1) + goto out; + + if (inode->ia_type != IA_IFDIR) + goto out; + + subdir = index_get_subdir_from_type(ENTRY_CHANGES); + ret = index_inode_ctx_get(inode, this, &ctx); + if (ctx->state[ENTRY_CHANGES] == UNKNOWN) + index_init_state(this, inode, ctx, subdir); + if (ctx->state[ENTRY_CHANGES] == IN) { + ret = index_del(this, inode->gfid, subdir, ENTRY_CHANGES); + ctx->state[ENTRY_CHANGES] = NOTIN; + } -void -fop_fxattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr) -{ - _xattrop_index_action (this, inode, xattr); +out: + return; } -inline gf_boolean_t -index_xattrop_track (loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict) +static gf_boolean_t +index_xattrop_track(xlator_t *this, gf_xattrop_flags_t flags, dict_t *dict) { - return (flags == GF_XATTROP_ADD_ARRAY); -} + index_priv_t *priv = this->private; -inline gf_boolean_t -index_fxattrop_track (fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) -{ - return (flags == GF_XATTROP_ADD_ARRAY); -} + if (flags == GF_XATTROP_ADD_ARRAY) + return _gf_true; -int -__index_fd_ctx_get (fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx) -{ - int ret = 0; - index_fd_ctx_t *fctx = NULL; - uint64_t tmpctx = 0; - char index_dir[PATH_MAX] = {0}; - index_priv_t *priv = NULL; + if (flags != GF_XATTROP_ADD_ARRAY64) + return _gf_false; - priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { - ret = -EINVAL; - goto out; - } + if (!priv->pending_watchlist) + return _gf_false; - ret = __fd_ctx_get (fd, this, &tmpctx); - if (!ret) { - fctx = (index_fd_ctx_t*) (long) tmpctx; - goto out; - } + if (dict_foreach_match(dict, is_xattr_in_watchlist, priv->pending_watchlist, + dict_null_foreach_fn, NULL) > 0) + return _gf_true; - fctx = GF_CALLOC (1, sizeof (*fctx), 
gf_index_fd_ctx_t); - if (!fctx) { - ret = -ENOMEM; - goto out; - } + return _gf_false; +} - make_index_dir_path (priv->index_basepath, XATTROP_SUBDIR, - index_dir, sizeof (index_dir)); - fctx->dir = opendir (index_dir); - if (!fctx->dir) { - ret = -errno; - GF_FREE (fctx); - fctx = NULL; - goto out; - } +int +index_inode_path(xlator_t *this, inode_t *inode, char *dirpath, size_t len) +{ + char *subdir = NULL; + int ret = 0; + index_priv_t *priv = NULL; + index_inode_ctx_t *ictx = NULL; + + priv = this->private; + if (!index_is_fop_on_internal_inode(this, inode, NULL)) { + ret = -EINVAL; + goto out; + } + + subdir = index_get_subdir_from_vgfid(priv, inode->gfid); + if (subdir) { + if (len <= strlen(priv->index_basepath) + 1 /*'/'*/ + strlen(subdir)) { + ret = -EINVAL; + goto out; + } + make_index_dir_path(priv->index_basepath, subdir, dirpath, len); + } else { + ret = index_inode_ctx_get(inode, this, &ictx); + if (ret) + goto out; + if (gf_uuid_is_null(ictx->virtual_pargfid)) { + ret = -EINVAL; + goto out; + } + make_index_dir_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, dirpath, + len); + if (len <= strlen(dirpath) + 1 /*'/'*/ + SLEN(UUID0_STR)) { + ret = -EINVAL; + goto out; + } + strcat(dirpath, "/"); + strcat(dirpath, uuid_utoa(ictx->virtual_pargfid)); + } +out: + return ret; +} - ret = __fd_ctx_set (fd, this, (uint64_t)(long)fctx); - if (ret) { - GF_FREE (fctx); - fctx = NULL; - ret = -EINVAL; - goto out; - } +int +__index_fd_ctx_get(fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx) +{ + int ret = 0; + index_fd_ctx_t *fctx = NULL; + uint64_t tmpctx = 0; + char dirpath[PATH_MAX] = {0}; + + ret = __fd_ctx_get(fd, this, &tmpctx); + if (!ret) { + fctx = (index_fd_ctx_t *)(long)tmpctx; + *ctx = fctx; + goto out; + } + + ret = index_inode_path(this, fd->inode, dirpath, sizeof(dirpath)); + if (ret) + goto out; + + fctx = GF_CALLOC(1, sizeof(*fctx), gf_index_fd_ctx_t); + if (!fctx) { + ret = -ENOMEM; + goto out; + } + + fctx->dir = sys_opendir(dirpath); + if 
(!fctx->dir) { + ret = -errno; + GF_FREE(fctx); + fctx = NULL; + goto out; + } + fctx->dir_eof = -1; + + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fctx); + if (ret) { + (void)sys_closedir(fctx->dir); + GF_FREE(fctx); + fctx = NULL; + ret = -EINVAL; + goto out; + } + *ctx = fctx; out: - if (fctx) - *ctx = fctx; - return ret; + return ret; } int -index_fd_ctx_get (fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx) +index_fd_ctx_get(fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx) { - int ret = 0; - LOCK (&fd->lock); - { - ret = __index_fd_ctx_get (fd, this, ctx); - } - UNLOCK (&fd->lock); - return ret; + int ret = 0; + LOCK(&fd->lock); + { + ret = __index_fd_ctx_get(fd, this, ctx); + } + UNLOCK(&fd->lock); + return ret; } -//new - Not NULL means start a fop -//new - NULL means done processing the fop +// new - Not NULL means start a fop +// new - NULL means done processing the fop void -index_queue_process (xlator_t *this, inode_t *inode, call_stub_t *new) +index_queue_process(xlator_t *this, inode_t *inode, call_stub_t *new) { - call_stub_t *stub = NULL; - index_inode_ctx_t *ctx = NULL; - int ret = 0; - call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + index_inode_ctx_t *ctx = NULL; + int ret = 0; + call_frame_t *frame = NULL; + + LOCK(&inode->lock); + { + ret = __index_inode_ctx_get(inode, this, &ctx); + if (ret) + goto unlock; - LOCK (&inode->lock); - { - ret = __index_inode_ctx_get (inode, this, &ctx); - if (ret) - goto unlock; - - if (new) { - __index_enqueue (&ctx->callstubs, new); - new = NULL; - } else { - ctx->processing = _gf_false; - } + if (new) { + __index_enqueue(&ctx->callstubs, new); + new = NULL; + } else { + ctx->processing = _gf_false; + } - if (!ctx->processing) { - stub = __index_dequeue (&ctx->callstubs); - if (stub) - ctx->processing = _gf_true; - else - ctx->processing = _gf_false; - } + if (!ctx->processing) { + stub = __index_dequeue(&ctx->callstubs); + if (stub) + ctx->processing = _gf_true; + else + ctx->processing = _gf_false; } + 
} unlock: - UNLOCK (&inode->lock); - - if (ret && new) { - frame = new->frame; - if (new->fop == GF_FOP_XATTROP) { - INDEX_STACK_UNWIND (xattrop, frame, -1, ENOMEM, - NULL, NULL); - } else if (new->fop == GF_FOP_FXATTROP) { - INDEX_STACK_UNWIND (fxattrop, frame, -1, ENOMEM, - NULL, NULL); - } - call_stub_destroy (new); - } else if (stub) { - call_resume (stub); - } - return; + UNLOCK(&inode->lock); + + if (ret && new) { + frame = new->frame; + if (new->fop == GF_FOP_XATTROP) { + INDEX_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL); + } else if (new->fop == GF_FOP_FXATTROP) { + INDEX_STACK_UNWIND(fxattrop, frame, -1, ENOMEM, NULL, NULL); + } + call_stub_destroy(new); + } else if (stub) { + call_resume(stub); + } + return; } -int32_t -index_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) +static int +xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr, dict_t *xdata, dict_match_t match, + dict_t *matchdata) { - inode_t *inode = NULL; + inode_t *inode = NULL; + index_local_t *local = NULL; - inode = inode_ref (frame->local); - if (op_ret < 0) - goto out; - fop_xattrop_index_action (this, frame->local, xattr); + local = frame->local; + inode = inode_ref(local->inode); + + if (op_ret < 0) + goto out; + + xattrop_index_action(this, local, xattr, match, matchdata); out: - INDEX_STACK_UNWIND (xattrop, frame, op_ret, op_errno, xattr, xdata); - index_queue_process (this, inode, NULL); - inode_unref (inode); + INDEX_STACK_UNWIND(xattrop, frame, op_ret, op_errno, xattr, xdata); + index_queue_process(this, inode, NULL); + inode_unref(inode); - return 0; + return 0; } int32_t -index_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +index_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + index_priv_t *priv = this->private; + + 
xattrop_cbk(frame, cookie, this, op_ret, op_errno, xattr, xdata, + is_xattr_in_watchlist, priv->complete_watchlist); + return 0; +} + +int32_t +index_xattrop64_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) { - inode_t *inode = NULL; - - inode = inode_ref (frame->local); - if (op_ret < 0) - goto out; + index_priv_t *priv = this->private; - fop_fxattrop_index_action (this, frame->local, xattr); -out: - INDEX_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, xattr, xdata); - index_queue_process (this, inode, NULL); - inode_unref (inode); + return xattrop_cbk(frame, cookie, this, op_ret, op_errno, xattr, xdata, + is_xattr_in_watchlist, priv->pending_watchlist); +} - return 0; +void +index_xattrop_do(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + int ret = -1; + int zfilled[XATTROP_TYPE_END] = { + 0, + }; + index_local_t *local = NULL; + fop_xattrop_cbk_t x_cbk = NULL; + + local = frame->local; + + if (optype == GF_XATTROP_ADD_ARRAY) + x_cbk = index_xattrop_cbk; + else + x_cbk = index_xattrop64_cbk; + + // In wind phase bring the gfid into index. This way if the brick crashes + // just after posix performs xattrop before _cbk reaches index xlator + // we will still have the gfid in index. + memset(zfilled, -1, sizeof(zfilled)); + + /* Foreach xattr, set corresponding index of zfilled to 1 + * zfilled[index] = 1 implies the xattr's value is zero filled + * and should be added in its corresponding subdir. + * + * zfilled should be set to 1 only for those index that + * exist in xattr variable. This is to distinguish + * between different types of volumes. + * For e.g., if the check is not made, + * zfilled[DIRTY] is set to 1 for EC volumes, + * index file will be tried to create in indices/dirty dir + * which doesn't exist for an EC volume. 
+ */ + ret = dict_foreach(xattr, index_fill_zero_array, zfilled); + + _index_action(this, local->inode, zfilled); + if (xdata) + ret = index_entry_action(this, local->inode, xdata, + GF_XATTROP_ENTRY_IN_KEY); + if (ret < 0) { + x_cbk(frame, NULL, this, -1, -ret, NULL, NULL); + return; + } + + if (loc) + STACK_WIND(frame, x_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata); + else + STACK_WIND(frame, x_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata); } int -index_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +index_xattrop_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - STACK_WIND (frame, index_xattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr, - xdata); - return 0; + index_xattrop_do(frame, this, loc, NULL, optype, xattr, xdata); + return 0; } int -index_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +index_fxattrop_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - STACK_WIND (frame, index_fxattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr, - xdata); - return 0; + index_xattrop_do(frame, this, NULL, fd, optype, xattr, xdata); + return 0; } int32_t -index_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +index_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - call_stub_t *stub = NULL; - - if (!index_xattrop_track (loc, flags, dict)) - goto out; - - frame->local = inode_ref (loc->inode); - stub = fop_xattrop_stub (frame, index_xattrop_wrapper, - loc, flags, dict, xdata); - if (!stub) { - 
INDEX_STACK_UNWIND (xattrop, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - index_queue_process (this, loc->inode, stub); + call_stub_t *stub = NULL; + index_local_t *local = NULL; + + if (!index_xattrop_track(this, flags, dict)) + goto out; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->inode = inode_ref(loc->inode); + if (xdata) + local->xdata = dict_ref(xdata); + stub = fop_xattrop_stub(frame, index_xattrop_wrapper, loc, flags, dict, + xdata); + +err: + if ((!local) || (!stub)) { + INDEX_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + index_queue_process(this, loc->inode, stub); + return 0; out: - STACK_WIND (frame, default_xattrop_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata); - return 0; + STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata); + return 0; } int32_t -index_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +index_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - call_stub_t *stub = NULL; - - if (!index_fxattrop_track (fd, flags, dict)) - goto out; - - frame->local = inode_ref (fd->inode); - stub = fop_fxattrop_stub (frame, index_fxattrop_wrapper, - fd, flags, dict, xdata); - if (!stub) { - INDEX_STACK_UNWIND (fxattrop, frame, -1, ENOMEM, NULL, xdata); - return 0; - } - - index_queue_process (this, fd->inode, stub); + call_stub_t *stub = NULL; + index_local_t *local = NULL; + + if (!index_xattrop_track(this, flags, dict)) + goto out; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->inode = inode_ref(fd->inode); + if (xdata) + local->xdata = dict_ref(xdata); + stub = fop_fxattrop_stub(frame, index_fxattrop_wrapper, fd, flags, dict, + xdata); + +err: + if ((!local) || (!stub)) { + 
INDEX_STACK_UNWIND(fxattrop, frame, -1, ENOMEM, NULL, xdata); return 0; + } + + index_queue_process(this, fd->inode, stub); + return 0; out: - STACK_WIND (frame, default_fxattrop_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata); - return 0; + STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata); + return 0; } -int32_t -index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +uint64_t +index_entry_count(xlator_t *this, char *subdir) { - index_priv_t *priv = NULL; - dict_t *xattr = NULL; - int ret = 0; + uint64_t count = 0; + index_priv_t *priv = NULL; + DIR *dirp = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + char index_dir[PATH_MAX] = { + 0, + }; - priv = this->private; + priv = this->private; - xattr = dict_new (); - if (!xattr) { - ret = -ENOMEM; - goto done; - } + make_index_dir_path(priv->index_basepath, subdir, index_dir, + sizeof(index_dir)); - if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { + dirp = sys_opendir(index_dir); + if (!dirp) + return 0; - ret = dict_set_static_bin (xattr, (char*)name, - priv->xattrop_vgfid, - sizeof (priv->xattrop_vgfid)); + for (;;) { + errno = 0; + entry = sys_readdir(dirp, scratch); + if (!entry || errno != 0) + break; - } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + continue; - ret = dict_set_static_bin (xattr, (char*)name, - priv->base_indices_holder_vgfid, - sizeof (priv->base_indices_holder_vgfid)); - } - if (ret) { - ret = -ENOMEM; - gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " - "gfid set failed"); - goto done; - } -done: - if (ret) - STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata); - else - STACK_UNWIND_STRICT (getxattr, frame, 0, 0, xattr, xdata); + if (!strncmp(entry->d_name, subdir, strlen(subdir))) + continue; - if 
(xattr) - dict_unref (xattr); + count++; + } - return 0; + (void)sys_closedir(dirp); + + return count; } int32_t -index_lookup_wrapper (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - index_priv_t *priv = NULL; - struct stat lstatbuf = {0}; - int ret = 0; - int32_t op_errno = EINVAL; - int32_t op_ret = -1; - char path[PATH_MAX] = {0}; - struct iatt stbuf = {0, }; - struct iatt postparent = {0,}; - dict_t *xattr = NULL; - gf_boolean_t is_dir = _gf_false; - - priv = this->private; - - VALIDATE_OR_GOTO (loc, done); - if (!uuid_compare (loc->gfid, priv->xattrop_vgfid)) { - make_index_dir_path (priv->index_basepath, XATTROP_SUBDIR, - path, sizeof (path)); - is_dir = _gf_true; - } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) { - make_file_path (priv->index_basepath, XATTROP_SUBDIR, - loc->name, path, sizeof (path)); - } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ - make_index_dir_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR, path, - sizeof (path)); - is_dir = _gf_true; - } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { - make_file_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR,loc->name, path, - sizeof (path)); +index_getxattr_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + index_priv_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + int vgfid_type = 0; + uint64_t count = 0; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto done; + } + + vgfid_type = index_get_type_from_vgfid_xattr(name); + if (vgfid_type >= 0) { + ret = dict_set_static_bin(xattr, (char *)name, + priv->internal_vgfid[vgfid_type], + sizeof(priv->internal_vgfid[vgfid_type])); + if (ret) { + ret = -EINVAL; + gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED, + "xattrop index " + "gfid set failed"); + goto done; } + } - ret = lstat (path, &lstatbuf); + /* TODO: Need to check what kind 
of link-counts are needed for + * ENTRY-CHANGES before refactor of this block with array*/ + if (strcmp(name, GF_XATTROP_INDEX_COUNT) == 0) { + count = index_entry_count(this, XATTROP_SUBDIR); + + ret = dict_set_uint64(xattr, (char *)name, count); if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "Stat failed on index dir " - "(%s)", strerror (errno)); - op_errno = errno; - goto done; - } else if (!S_ISDIR (lstatbuf.st_mode) && is_dir) { - gf_log (this->name, GF_LOG_DEBUG, "Stat failed on index dir, " - "not a directory"); - op_errno = ENOENT; - goto done; - } - xattr = dict_new (); - if (!xattr) { - op_errno = ENOMEM; - goto done; + ret = -EINVAL; + gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED, + "xattrop index " + "count set failed"); + goto done; } + } else if (strcmp(name, GF_XATTROP_DIRTY_COUNT) == 0) { + count = index_entry_count(this, DIRTY_SUBDIR); - iatt_from_stat (&stbuf, &lstatbuf); - if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) { - uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); - } else if (is_dir && - !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { - uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); - } else { - uuid_generate (stbuf.ia_gfid); + ret = dict_set_uint64(xattr, (char *)name, count); + if (ret) { + ret = -EINVAL; + gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED, + "dirty index " + "count set failed"); + goto done; } - stbuf.ia_ino = -1; - op_ret = 0; + } done: - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, - loc->inode, &stbuf, xattr, &postparent); - if (xattr) - dict_unref (xattr); + if (ret) + STACK_UNWIND_STRICT(getxattr, frame, -1, -ret, xattr, NULL); + else + STACK_UNWIND_STRICT(getxattr, frame, 0, 0, xattr, NULL); + + if (xattr) + dict_unref(xattr); + + return 0; +} + +static int +index_save_pargfid_for_entry_changes(xlator_t *this, loc_t *loc, char *path) +{ + index_priv_t *priv = NULL; + index_inode_ctx_t *ctx = NULL; + int ret = 0; + + priv = 
this->private; + if (!loc) + return -1; + if (gf_uuid_compare(loc->pargfid, priv->internal_vgfid[ENTRY_CHANGES])) return 0; + + ret = index_inode_ctx_get(loc->inode, this, &ctx); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + INDEX_MSG_INODE_CTX_GET_SET_FAILED, + "Unable to get inode context for %s", path); + return -EINVAL; + } + ret = gf_uuid_parse(loc->name, ctx->virtual_pargfid); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + INDEX_MSG_INODE_CTX_GET_SET_FAILED, + "Unable to store " + "virtual gfid in inode context for %s", + path); + return -EINVAL; + } + return 0; } int32_t -base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *xdata) -{ - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - DIR *dir = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int count = 0; - gf_dirent_t entries; - - priv = this->private; - - make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - base_indices_holder, sizeof (base_indices_holder)); - - dir = opendir (base_indices_holder); - if (!dir) { - op_errno = EINVAL; - goto done; +index_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) +{ + index_priv_t *priv = NULL; + struct stat lstatbuf = {0}; + int ret = 0; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + uint64_t val = IA_INVAL; + char path[PATH_MAX] = {0}; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + dict_t *xattr = NULL; + gf_boolean_t is_dir = _gf_false; + char *subdir = NULL; + loc_t iloc = {0}; + + priv = this->private; + loc_copy(&iloc, loc); + + VALIDATE_OR_GOTO(loc, done); + if (index_is_fop_on_internal_inode(this, loc->parent, loc->pargfid)) { + subdir = index_get_subdir_from_vgfid(priv, loc->pargfid); + ret = index_inode_path(this, loc->parent, path, sizeof(path)); + if (ret < 0) { + op_errno = -ret; + goto done; } + ret = snprintf(path + strlen(path), PATH_MAX - strlen(path), 
"/%s", + loc->name); + if ((ret < 0) || (ret > (PATH_MAX - strlen(path)))) { + op_errno = EINVAL; + op_ret = -1; + goto done; + } - INIT_LIST_HEAD (&entries.list); + } else if (index_is_virtual_gfid(priv, loc->gfid)) { + subdir = index_get_subdir_from_vgfid(priv, loc->gfid); + make_index_dir_path(priv->index_basepath, subdir, path, sizeof(path)); + is_dir = _gf_true; - count = index_fill_readdir (fd, dir, off, size, &entries, - BASE_INDICES_HOLDER); - /* pick ENOENT to indicate EOF */ + if ((xattr_req) && (dict_get(xattr_req, GF_INDEX_IA_TYPE_GET_REQ))) { + if (0 == strcmp(subdir, index_get_subdir_from_type(ENTRY_CHANGES))) + val = IA_IFDIR; + else + val = IA_IFREG; + } + } else { + if (!inode_is_linked(loc->inode)) { + inode_unref(iloc.inode); + iloc.inode = inode_find(loc->inode->table, loc->gfid); + } + ret = index_inode_path(this, iloc.inode, path, sizeof(path)); + if (ret < 0) { + op_errno = -ret; + goto done; + } + } + ret = sys_lstat(path, &lstatbuf); + if (ret) { + gf_msg_debug(this->name, errno, "Stat failed on %s dir ", path); op_errno = errno; - op_ret = count; - closedir (dir); + goto done; + } else if (!S_ISDIR(lstatbuf.st_mode) && is_dir) { + op_errno = ENOTDIR; + gf_msg_debug(this->name, op_errno, + "Stat failed on %s dir, " + "not a directory", + path); + goto done; + } + xattr = dict_new(); + if (!xattr) { + op_errno = ENOMEM; + goto done; + } + + if (val != IA_INVAL) { + ret = dict_set_uint64(xattr, GF_INDEX_IA_TYPE_GET_RSP, val); + if (ret) { + op_ret = -1; + op_errno = -ret; + goto done; + } + } + + iatt_from_stat(&stbuf, &lstatbuf); + if (is_dir || inode_is_linked(iloc.inode)) + loc_gfid(&iloc, stbuf.ia_gfid); + else + gf_uuid_generate(stbuf.ia_gfid); + + ret = index_save_pargfid_for_entry_changes(this, &iloc, path); + if (ret) { + op_ret = -1; + op_errno = -ret; + goto done; + } + + stbuf.ia_ino = -1; + op_ret = 0; done: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); - gf_dirent_free (&entries); - return 0; + 
STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, + loc ? loc->inode : NULL, &stbuf, xattr, &postparent); + if (xattr) + dict_unref(xattr); + loc_wipe(&iloc); + return 0; +} + +int +index_get_gfid_type(void *opaque) +{ + gf_dirent_t *entry = NULL; + xlator_t *this = THIS; + struct index_syncop_args *args = opaque; + loc_t loc = {0}; + struct iatt iatt = {0}; + int ret = 0; + + list_for_each_entry(entry, &args->entries->list, list) + { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + continue; + + loc_wipe(&loc); + + entry->d_type = gf_d_type_from_ia_type(IA_INVAL); + entry->d_stat.ia_type = IA_INVAL; + if (gf_uuid_parse(entry->d_name, loc.gfid)) + continue; + + loc.inode = inode_find(args->parent->table, loc.gfid); + if (loc.inode) { + entry->d_stat.ia_type = loc.inode->ia_type; + entry->d_type = gf_d_type_from_ia_type(loc.inode->ia_type); + continue; + } + loc.inode = inode_new(args->parent->table); + if (!loc.inode) + continue; + ret = syncop_lookup(FIRST_CHILD(this), &loc, &iatt, 0, 0, 0); + if (ret == 0) { + entry->d_type = gf_d_type_from_ia_type(iatt.ia_type); + entry->d_stat = iatt; + } + } + loc_wipe(&loc); + + return 0; } int32_t -index_readdir_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *xdata) +index_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) { - index_fd_ctx_t *fctx = NULL; - DIR *dir = NULL; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - int count = 0; - gf_dirent_t entries; + index_fd_ctx_t *fctx = NULL; + index_priv_t *priv = NULL; + DIR *dir = NULL; + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + int count = 0; + gf_dirent_t entries; + struct index_syncop_args args = {0}; + + priv = this->private; + INIT_LIST_HEAD(&entries.list); + + ret = index_fd_ctx_get(fd, this, &fctx); + if (ret < 0) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, op_errno, INDEX_MSG_FD_OP_FAILED, + 
"pfd is NULL, fd=%p", fd); + goto done; + } + + dir = fctx->dir; + if (!dir) { + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, op_errno, + INDEX_MSG_INDEX_READDIR_FAILED, "dir is NULL for fd=%p", fd); + goto done; + } + + count = index_fill_readdir(fd, fctx, dir, off, size, &entries); + + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + if (index_is_virtual_gfid(priv, fd->inode->gfid) && xdata && + dict_get(xdata, "get-gfid-type")) { + args.parent = fd->inode; + args.entries = &entries; + ret = synctask_new(this->ctx->env, index_get_gfid_type, NULL, NULL, + &args); + } +done: + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, NULL); + gf_dirent_free(&entries); + return 0; +} - INIT_LIST_HEAD (&entries.list); +int +deletion_handler(const char *fpath, const struct stat *sb, int typeflag, + struct FTW *ftwbuf) +{ + ia_type_t type = IA_INVAL; + + switch (sb->st_mode & S_IFMT) { + case S_IFREG: + sys_unlink(fpath); + break; + + case S_IFDIR: + sys_rmdir(fpath); + break; + default: + type = ia_type_from_st_mode(sb->st_mode); + gf_msg(THIS->name, GF_LOG_WARNING, EINVAL, INDEX_MSG_INVALID_ARGS, + "%s neither a regular file nor a directory - type:%s", fpath, + gf_inode_type_to_str(type)); + break; + } + return 0; +} - ret = index_fd_ctx_get (fd, this, &fctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto done; - } +static int +index_wipe_index_subdir(void *opaque) +{ + struct index_syncop_args *args = opaque; - dir = fctx->dir; + nftw(args->path, deletion_handler, 1, FTW_DEPTH | FTW_PHYS); + return 0; +} - if (!dir) { - gf_log (this->name, GF_LOG_WARNING, - "dir is NULL for fd=%p", fd); - op_errno = EINVAL; - goto done; - } +static void +index_get_parent_iatt(struct iatt *parent, char *path, loc_t *loc, + int32_t *op_ret, int32_t *op_errno) +{ + int ret = -1; + struct stat lstatbuf = { + 0, + }; + + ret = sys_lstat(path, &lstatbuf); + if (ret < 0) { + *op_ret = 
-1; + *op_errno = errno; + return; + } - count = index_fill_readdir (fd, dir, off, size, &entries, - INDEX_XATTROP); + iatt_from_stat(parent, &lstatbuf); + gf_uuid_copy(parent->ia_gfid, loc->pargfid); + parent->ia_ino = -1; - /* pick ENOENT to indicate EOF */ - op_errno = errno; - op_ret = count; -done: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); - gf_dirent_free (&entries); - return 0; + return; } int -index_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, - dict_t *xdata) -{ - index_priv_t *priv = NULL; - int32_t op_ret = 0; - int32_t op_errno = 0; - int ret = 0; - struct iatt preparent = {0}; - struct iatt postparent = {0}; - char index_dir[PATH_MAX] = {0}; - struct stat lstatbuf = {0}; - uuid_t gfid = {0}; - - priv = this->private; - make_index_dir_path (priv->index_basepath, XATTROP_SUBDIR, - index_dir, sizeof (index_dir)); - ret = lstat (index_dir, &lstatbuf); +index_rmdir_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + int ret = 0; + int32_t op_ret = 0; + int32_t op_errno = 0; + char *subdir = NULL; + char index_dir[PATH_MAX] = {0}; + char index_subdir[PATH_MAX] = {0}; + uuid_t gfid = {0}; + struct iatt preparent = {0}; + struct iatt postparent = {0}; + index_priv_t *priv = NULL; + index_xattrop_type_t type = XATTROP_TYPE_UNSET; + struct index_syncop_args args = { + 0, + }; + + priv = this->private; + + type = index_get_type_from_vgfid(priv, loc->pargfid); + subdir = index_get_subdir_from_vgfid(priv, loc->pargfid); + make_index_dir_path(priv->index_basepath, subdir, index_dir, + sizeof(index_dir)); + + index_get_parent_iatt(&preparent, index_dir, loc, &op_ret, &op_errno); + if (op_ret < 0) + goto done; + + gf_uuid_parse(loc->name, gfid); + make_gfid_path(priv->index_basepath, subdir, gfid, index_subdir, + sizeof(index_subdir)); + + if (flag == 0) { + ret = index_del(this, gfid, subdir, type); if (ret < 0) { - op_ret = -1; - op_errno = errno; - goto done; + 
op_ret = -1; + op_errno = -ret; + goto done; } + } else { + args.path = index_subdir; + ret = synctask_new(this->ctx->env, index_wipe_index_subdir, NULL, NULL, + &args); + } + + index_get_parent_iatt(&postparent, index_dir, loc, &op_ret, &op_errno); + if (op_ret < 0) + goto done; - iatt_from_stat (&preparent, &lstatbuf); - uuid_copy (preparent.ia_gfid, priv->xattrop_vgfid); - preparent.ia_ino = -1; - uuid_parse (loc->name, gfid); - ret = index_del (this, gfid, XATTROP_SUBDIR); - if (ret < 0) { - op_ret = -1; - op_errno = -ret; - goto done; - } - memset (&lstatbuf, 0, sizeof (lstatbuf)); - ret = lstat (index_dir, &lstatbuf); - if (ret < 0) { - op_ret = -1; - op_errno = errno; - goto done; - } - iatt_from_stat (&postparent, &lstatbuf); - uuid_copy (postparent.ia_gfid, priv->xattrop_vgfid); - postparent.ia_ino = -1; done: - INDEX_STACK_UNWIND (unlink, frame, op_ret, op_errno, &preparent, - &postparent, xdata); - return 0; + INDEX_STACK_UNWIND(rmdir, frame, op_ret, op_errno, &preparent, &postparent, + xdata); + return 0; +} + +int +index_unlink_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + index_priv_t *priv = NULL; + index_inode_ctx_t *ictx = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int ret = 0; + index_xattrop_type_t type = XATTROP_TYPE_UNSET; + struct iatt preparent = {0}; + struct iatt postparent = {0}; + char index_dir[PATH_MAX] = {0}; + char filepath[PATH_MAX] = {0}; + uuid_t gfid = {0}; + char *subdir = NULL; + + priv = this->private; + type = index_get_type_from_vgfid(priv, loc->pargfid); + ret = index_inode_path(this, loc->parent, index_dir, sizeof(index_dir)); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + goto done; + } + + index_get_parent_iatt(&preparent, index_dir, loc, &op_ret, &op_errno); + if (op_ret < 0) + goto done; + + if (type <= XATTROP_TYPE_UNSET) { + ret = index_inode_ctx_get(loc->parent, this, &ictx); + if ((ret == 0) && gf_uuid_is_null(ictx->virtual_pargfid)) { + ret = -EINVAL; + } 
+ if (ret == 0) { + ret = index_entry_delete(this, ictx->virtual_pargfid, + (char *)loc->name); + } + } else if (type == ENTRY_CHANGES) { + make_file_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, + (char *)loc->name, filepath, sizeof(filepath)); + ret = sys_unlink(filepath); + } else { + subdir = index_get_subdir_from_type(type); + gf_uuid_parse(loc->name, gfid); + ret = index_del(this, gfid, subdir, type); + } + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + goto done; + } + + index_get_parent_iatt(&postparent, index_dir, loc, &op_ret, &op_errno); + if (op_ret < 0) + goto done; +done: + INDEX_STACK_UNWIND(unlink, frame, op_ret, op_errno, &preparent, &postparent, + xdata); + return 0; } int32_t -index_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +index_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - call_stub_t *stub = NULL; + call_stub_t *stub = NULL; - if (!name) - goto out; - if (strcmp (GF_XATTROP_INDEX_GFID, name) && - strcmp (GF_BASE_INDICES_HOLDER_GFID, name)) - goto out; + if (!name || + (!index_is_vgfid_xattr(name) && strcmp(GF_XATTROP_INDEX_COUNT, name) && + strcmp(GF_XATTROP_DIRTY_COUNT, name))) + goto out; - stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, - xdata); - if (!stub) { - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - worker_enqueue (this, stub); + stub = fop_getxattr_stub(frame, index_getxattr_wrapper, loc, name, xdata); + if (!stub) { + STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL); return 0; + } + worker_enqueue(this, stub); + return 0; out: - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); - return 0; + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; } -int32_t -index_lookup (call_frame_t *frame, xlator_t *this, - loc_t 
*loc, dict_t *xattr_req) +int64_t +index_fetch_link_count(xlator_t *this, index_xattrop_type_t type) { - call_stub_t *stub = NULL; - index_priv_t *priv = NULL; + index_priv_t *priv = this->private; + char *subdir = NULL; + struct stat lstatbuf = { + 0, + }; + int ret = -1; + int64_t count = -1; + DIR *dirp = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + char index_dir[PATH_MAX] = { + 0, + }; + char index_path[PATH_MAX] = { + 0, + }; + + subdir = index_get_subdir_from_type(type); + make_index_dir_path(priv->index_basepath, subdir, index_dir, + sizeof(index_dir)); + + dirp = sys_opendir(index_dir); + if (!dirp) + goto out; + + for (;;) { + errno = 0; + entry = sys_readdir(dirp, scratch); + if (!entry || errno != 0) { + if (count == -1) + count = 0; + goto out; + } + + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + continue; + + make_file_path(priv->index_basepath, subdir, entry->d_name, index_path, + sizeof(index_path)); + + ret = sys_lstat(index_path, &lstatbuf); + if (ret < 0) { + count = -2; + continue; + } else { + count = lstatbuf.st_nlink - 1; + if (count == 0) + continue; + else + break; + } + } +out: + if (dirp) + (void)sys_closedir(dirp); + return count; +} - priv = this->private; +dict_t * +index_fill_link_count(xlator_t *this, dict_t *xdata) +{ + int ret = -1; + index_priv_t *priv = NULL; + int64_t count = -1; + + priv = this->private; + xdata = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!xdata) + goto out; + + index_get_link_count(priv, &count, XATTROP); + if (count < 0) { + count = index_fetch_link_count(this, XATTROP); + index_set_link_count(priv, count, XATTROP); + } + + if (count == 0) { + ret = dict_set_int8(xdata, "link-count", 0); + if (ret < 0) + gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_DICT_SET_FAILED, + "Unable to set link-count"); + } else { + ret = dict_set_int8(xdata, "link-count", 1); + if (ret < 0) + gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_DICT_SET_FAILED, + "Unable to set link-count"); + } - if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && - uuid_compare (loc->pargfid, priv->xattrop_vgfid) && - uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && - uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) - goto normal; +out: + return xdata; +} - stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); - if (!stub) { - STACK_UNWIND_STRICT (lookup, frame, -1, ENOMEM, loc->inode, - NULL, NULL, NULL); - return 0; +int32_t +index_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + xdata = index_fill_link_count(this, xdata); + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + if (xdata) + dict_unref(xdata); + return 0; +} + +int32_t +index_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + inode_t *inode = NULL; + call_stub_t *stub = NULL; + char *flag = NULL; + int ret = -1; + + if (!index_is_fop_on_internal_inode(this, loc->parent, loc->pargfid) && + !index_is_fop_on_internal_inode(this, loc->inode, loc->gfid)) { + if (!inode_is_linked(loc->inode)) { + inode = inode_find(loc->inode->table, loc->gfid); + if (!index_is_fop_on_internal_inode(this, inode, loc->gfid)) { + inode_unref(inode); + goto normal; + } + inode_unref(inode); + } else { + goto normal; } - 
worker_enqueue (this, stub); + } + + stub = fop_lookup_stub(frame, index_lookup_wrapper, loc, xattr_req); + if (!stub) { + STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, loc->inode, NULL, NULL, + NULL); return 0; + } + worker_enqueue(this, stub); + return 0; normal: - STACK_WIND (frame, default_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + ret = dict_get_str_sizen(xattr_req, "link-count", &flag); + if ((ret == 0) && (strcmp(flag, GF_XATTROP_INDEX_COUNT) == 0)) { + STACK_WIND(frame, index_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + } else { + STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + } + + return 0; +} - return 0; +int32_t +index_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + xdata = index_fill_link_count(this, xdata); + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata); + if (xdata) + dict_unref(xdata); + return 0; } int32_t -index_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *xdata) +index_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - index_priv_t *priv = NULL; + int ret = -1; + char *flag = NULL; + + ret = dict_get_str(xdata, "link-count", &flag); + if ((ret == 0) && (strcmp(flag, GF_XATTROP_INDEX_COUNT) == 0)) { + STACK_WIND(frame, index_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + } else { + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + } + + return 0; +} - priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && - uuid_compare (fd->inode->gfid, priv->base_indices_holder_vgfid)) - goto out; +int32_t +index_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + if 
(!index_is_fop_on_internal_inode(this, fd->inode, NULL)) + goto normal; - if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { - stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, - off, xdata); - } else if (!uuid_compare (fd->inode->gfid, - priv->base_indices_holder_vgfid)) { - stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, - fd, size, off, xdata); - } + frame->local = NULL; + STACK_UNWIND_STRICT(opendir, frame, 0, 0, fd, NULL); + return 0; - if (!stub) { - STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - worker_enqueue (this, stub); +normal: + STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} + +int32_t +index_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + if (!index_is_fop_on_internal_inode(this, fd->inode, NULL)) + goto out; + + stub = fop_readdir_stub(frame, index_readdir_wrapper, fd, size, off, xdata); + if (!stub) { + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); return 0; + } + worker_enqueue(this, stub); + return 0; out: - STACK_WIND (frame, default_readdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata); - return 0; + STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata); + return 0; } int -index_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +index_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - call_stub_t *stub = NULL; - index_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; - if (uuid_compare (loc->pargfid, priv->xattrop_vgfid)) - goto out; + if (!index_is_fop_on_internal_inode(this, loc->parent, NULL)) + goto out; - stub = fop_unlink_stub (frame, index_unlink_wrapper, loc, xflag, xdata); - if (!stub) { - 
STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, NULL, NULL, - NULL); - return 0; - } - worker_enqueue (this, stub); + stub = fop_unlink_stub(frame, index_unlink_wrapper, loc, xflag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, NULL, NULL, NULL); return 0; + } + worker_enqueue(this, stub); + return 0; out: - STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); - return 0; + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; } -int32_t -mem_acct_init (xlator_t *this) +int +index_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) { - int ret = -1; + call_stub_t *stub = NULL; - ret = xlator_mem_acct_init (this, gf_index_mt_end + 1); + if (!index_is_fop_on_internal_inode(this, loc->parent, NULL)) + goto out; - return ret; + stub = fop_rmdir_stub(frame, index_rmdir_wrapper, loc, flags, xdata); + if (!stub) { + STACK_UNWIND_STRICT(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + worker_enqueue(this, stub); + return 0; +out: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, flags, xdata); + return 0; } int -init (xlator_t *this) -{ - int ret = -1; - index_priv_t *priv = NULL; - pthread_t thread; - pthread_attr_t w_attr; - gf_boolean_t mutex_inited = _gf_false; - gf_boolean_t cond_inited = _gf_false; - gf_boolean_t attr_inited = _gf_false; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "'index' not configured with exactly one child"); - goto out; - } +index_make_xattrop_watchlist(xlator_t *this, index_priv_t *priv, + char *watchlist, index_xattrop_type_t type) +{ + char *delim = NULL; + char *dup_watchlist = NULL; + char *key = NULL; + char *saveptr = NULL; + dict_t *xattrs = NULL; + data_t *dummy = NULL; + int ret = 0; + + if (!watchlist) + return 0; - if (!this->parents) { - gf_log 
(this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } + dup_watchlist = gf_strdup(watchlist); + if (!dup_watchlist) + return -1; - priv = GF_CALLOC (1, sizeof (*priv), gf_index_mt_priv_t); - if (!priv) - goto out; + xattrs = dict_new(); + if (!xattrs) { + ret = -1; + goto out; + } - LOCK_INIT (&priv->lock); - if ((ret = pthread_cond_init(&priv->cond, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_cond_init failed (%d)", ret); - goto out; - } - cond_inited = _gf_true; + dummy = int_to_data(1); + if (!dummy) { + ret = -1; + goto out; + } - if ((ret = pthread_mutex_init(&priv->mutex, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_mutex_init failed (%d)", ret); - goto out; - } - mutex_inited = _gf_true; + data_ref(dummy); - if ((ret = pthread_attr_init (&w_attr)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_attr_init failed (%d)", ret); - goto out; + delim = ","; + key = strtok_r(dup_watchlist, delim, &saveptr); + while (key) { + if (strlen(key) == 0) { + ret = -1; + goto out; } - attr_inited = _gf_true; - ret = pthread_attr_setstacksize (&w_attr, INDEX_THREAD_STACK_SIZE); - if (ret == EINVAL) { - gf_log (this->name, GF_LOG_WARNING, - "Using default thread stack size"); - } - GF_OPTION_INIT ("index-base", priv->index_basepath, path, out); - uuid_generate (priv->index); - uuid_generate (priv->xattrop_vgfid); - /*base_indices_holder is a directory which contains hard links to - * all base indices inside indices/xattrop directory*/ - uuid_generate (priv->base_indices_holder_vgfid); - INIT_LIST_HEAD (&priv->callstubs); - - this->private = priv; - ret = gf_thread_create (&thread, &w_attr, index_worker, this); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Failed to create " - "worker thread, aborting"); + ret = dict_set(xattrs, key, dummy); + if (ret) + goto out; + + key = strtok_r(NULL, delim, &saveptr); + } + + switch (type) { + case DIRTY: + priv->dirty_watchlist = dict_copy_with_ref(xattrs, + 
priv->dirty_watchlist); + if (!priv->dirty_watchlist) { + ret = -1; goto out; - } - ret = 0; + } + break; + case XATTROP: + priv->pending_watchlist = dict_copy_with_ref( + xattrs, priv->pending_watchlist); + if (!priv->pending_watchlist) { + ret = -1; + goto out; + } + break; + default: + break; + } + + ret = 0; out: - if (ret) { - if (cond_inited) - pthread_cond_destroy (&priv->cond); - if (mutex_inited) - pthread_mutex_destroy (&priv->mutex); - if (priv) - GF_FREE (priv); - this->private = NULL; - } - if (attr_inited) - pthread_attr_destroy (&w_attr); - return ret; + if (xattrs) + dict_unref(xattrs); + + GF_FREE(dup_watchlist); + + if (dummy) + data_unref(dummy); + + return ret; } -void -fini (xlator_t *this) +int32_t +mem_acct_init(xlator_t *this) { - index_priv_t *priv = NULL; + int ret = -1; - priv = this->private; - if (!priv) - goto out; + ret = xlator_mem_acct_init(this, gf_index_mt_end + 1); + + return ret; +} + +int +init(xlator_t *this) +{ + int i = 0; + int ret = -1; + int64_t count = -1; + index_priv_t *priv = NULL; + pthread_attr_t w_attr; + gf_boolean_t mutex_inited = _gf_false; + gf_boolean_t cond_inited = _gf_false; + gf_boolean_t attr_inited = _gf_false; + char *watchlist = NULL; + char *dirtylist = NULL; + char *pendinglist = NULL; + char *index_base_parent = NULL; + char *tmp = NULL; + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_INVALID_GRAPH, + "'index' not configured with exactly one child"); + goto out; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, INDEX_MSG_INVALID_GRAPH, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_index_mt_priv_t); + if (!priv) + goto out; + + LOCK_INIT(&priv->lock); + if ((ret = pthread_cond_init(&priv->cond, NULL)) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS, + "pthread_cond_init failed"); + goto out; + } + cond_inited = _gf_true; + + if ((ret = pthread_mutex_init(&priv->mutex, NULL)) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS, + "pthread_mutex_init failed"); + goto out; + } + mutex_inited = _gf_true; + + if ((ret = pthread_attr_init(&w_attr)) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS, + "pthread_attr_init failed"); + goto out; + } + attr_inited = _gf_true; + + ret = pthread_attr_setstacksize(&w_attr, INDEX_THREAD_STACK_SIZE); + if (ret == EINVAL) { + gf_msg(this->name, GF_LOG_WARNING, ret, INDEX_MSG_INVALID_ARGS, + "Using default thread stack size"); + } + + GF_OPTION_INIT("index-base", priv->index_basepath, path, out); + tmp = gf_strdup(priv->index_basepath); + index_base_parent = dirname(tmp); + if (gf_lstat_dir(index_base_parent, NULL) != 0) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, + INDEX_MSG_INDEX_DIR_CREATE_FAILED, + "Failed to find parent dir (%s) of index basepath %s.", + index_base_parent, priv->index_basepath); + goto out; + } + + GF_OPTION_INIT("xattrop64-watchlist", watchlist, str, out); + ret = index_make_xattrop_watchlist(this, priv, watchlist, XATTROP); + if (ret) + goto out; + + GF_OPTION_INIT("xattrop-dirty-watchlist", dirtylist, str, out); + ret = index_make_xattrop_watchlist(this, priv, dirtylist, DIRTY); + if (ret) + goto out; + + GF_OPTION_INIT("xattrop-pending-watchlist", pendinglist, str, out); + ret = index_make_xattrop_watchlist(this, priv, pendinglist, XATTROP); + if (ret) + goto out; + + if (priv->dirty_watchlist) + priv->complete_watchlist = dict_copy_with_ref(priv->dirty_watchlist, + priv->complete_watchlist); + if (priv->pending_watchlist) + priv->complete_watchlist 
= dict_copy_with_ref(priv->pending_watchlist, + priv->complete_watchlist); + + gf_uuid_generate(priv->index); + for (i = 0; i < XATTROP_TYPE_END; i++) + gf_uuid_generate(priv->internal_vgfid[i]); + + INIT_LIST_HEAD(&priv->callstubs); + GF_ATOMIC_INIT(priv->stub_cnt, 0); + + this->local_pool = mem_pool_new(index_local_t, 64); + if (!this->local_pool) { + ret = -1; + goto out; + } + + this->private = priv; + + ret = index_dir_create(this, XATTROP_SUBDIR); + if (ret < 0) + goto out; + + if (priv->dirty_watchlist) { + ret = index_dir_create(this, DIRTY_SUBDIR); + if (ret < 0) + goto out; + } + + ret = index_dir_create(this, ENTRY_CHANGES_SUBDIR); + if (ret < 0) + goto out; + + /*init indices files counts*/ + count = index_fetch_link_count(this, XATTROP); + index_set_link_count(priv, count, XATTROP); + priv->down = _gf_false; + + priv->curr_count = 0; + ret = gf_thread_create(&priv->thread, &w_attr, index_worker, this, + "idxwrker"); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ret, + INDEX_MSG_WORKER_THREAD_CREATE_FAILED, + "Failed to create worker thread, aborting"); + goto out; + } + priv->curr_count++; + ret = 0; +out: + GF_FREE(tmp); + + if (ret) { + if (cond_inited) + pthread_cond_destroy(&priv->cond); + if (mutex_inited) + pthread_mutex_destroy(&priv->mutex); + if (priv && priv->dirty_watchlist) + dict_unref(priv->dirty_watchlist); + if (priv && priv->pending_watchlist) + dict_unref(priv->pending_watchlist); + if (priv && priv->complete_watchlist) + dict_unref(priv->complete_watchlist); + if (priv) + GF_FREE(priv); this->private = NULL; - LOCK_DESTROY (&priv->lock); - pthread_cond_destroy (&priv->cond); - pthread_mutex_destroy (&priv->mutex); - GF_FREE (priv); + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + if (attr_inited) + pthread_attr_destroy(&w_attr); + return ret; +} + +void +fini(xlator_t *this) +{ + index_priv_t *priv = NULL; + + priv = this->private; + if (!priv) + goto out; + + priv->down = _gf_true; + 
pthread_cond_broadcast(&priv->cond); + if (priv->thread) { + gf_thread_cleanup_xint(priv->thread); + priv->thread = 0; + } + this->private = NULL; + LOCK_DESTROY(&priv->lock); + pthread_cond_destroy(&priv->cond); + pthread_mutex_destroy(&priv->mutex); + if (priv->dirty_watchlist) + dict_unref(priv->dirty_watchlist); + if (priv->pending_watchlist) + dict_unref(priv->pending_watchlist); + if (priv->complete_watchlist) + dict_unref(priv->complete_watchlist); + GF_FREE(priv); + + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } out: - return; + return; } int -index_forget (xlator_t *this, inode_t *inode) +index_forget(xlator_t *this, inode_t *inode) { - uint64_t tmp_cache = 0; - if (!inode_ctx_del (inode, this, &tmp_cache)) - GF_FREE ((index_inode_ctx_t*) (long)tmp_cache); + uint64_t tmp_cache = 0; + if (!inode_ctx_del(inode, this, &tmp_cache)) + GF_FREE((index_inode_ctx_t *)(long)tmp_cache); - return 0; + return 0; } int32_t -index_releasedir (xlator_t *this, fd_t *fd) +index_releasedir(xlator_t *this, fd_t *fd) { - index_fd_ctx_t *fctx = NULL; - uint64_t ctx = 0; - int ret = 0; + index_fd_ctx_t *fctx = NULL; + uint64_t ctx = 0; + int ret = 0; - ret = fd_ctx_del (fd, this, &ctx); - if (ret < 0) - goto out; + ret = fd_ctx_del(fd, this, &ctx); + if (ret < 0) + goto out; - fctx = (index_fd_ctx_t*) (long) ctx; - if (fctx->dir) - closedir (fctx->dir); + fctx = (index_fd_ctx_t *)(long)ctx; + if (fctx->dir) { + ret = sys_closedir(fctx->dir); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, errno, INDEX_MSG_FD_OP_FAILED, + "closedir error"); + } - GF_FREE (fctx); + GF_FREE(fctx); out: - return 0; + return 0; } int32_t -index_release (xlator_t *this, fd_t *fd) +index_release(xlator_t *this, fd_t *fd) { - index_fd_ctx_t *fctx = NULL; - uint64_t ctx = 0; - int ret = 0; + index_fd_ctx_t *fctx = NULL; + uint64_t ctx = 0; + int ret = 0; - ret = fd_ctx_del (fd, this, &ctx); - if (ret < 0) - goto out; + ret = fd_ctx_del(fd, this, &ctx); + if 
(ret < 0) + goto out; - fctx = (index_fd_ctx_t*) (long) ctx; - GF_FREE (fctx); + fctx = (index_fd_ctx_t *)(long)ctx; + GF_FREE(fctx); out: - return 0; + return 0; } int -notify (xlator_t *this, int event, void *data, ...) +notify(xlator_t *this, int event, void *data, ...) { - int ret = 0; - ret = default_notify (this, event, data); - return ret; + int ret = 0; + index_priv_t *priv = NULL; + uint64_t stub_cnt = 0; + xlator_t *victim = data; + struct timespec sleep_till = { + 0, + }; + + if (!this) + return 0; + + priv = this->private; + if (!priv) + return 0; + + if ((event == GF_EVENT_PARENT_DOWN) && victim->cleanup_starting) { + stub_cnt = GF_ATOMIC_GET(priv->stub_cnt); + timespec_now_realtime(&sleep_till); + sleep_till.tv_sec += 1; + + /* Wait for draining stub from queue before notify PARENT_DOWN */ + pthread_mutex_lock(&priv->mutex); + { + while (stub_cnt) { + (void)pthread_cond_timedwait(&priv->cond, &priv->mutex, + &sleep_till); + stub_cnt = GF_ATOMIC_GET(priv->stub_cnt); + } + } + pthread_mutex_unlock(&priv->mutex); + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name); + } + + if ((event == GF_EVENT_CHILD_DOWN) && victim->cleanup_starting) { + pthread_mutex_lock(&priv->mutex); + { + priv->down = _gf_true; + pthread_cond_broadcast(&priv->cond); + while (priv->curr_count) + pthread_cond_wait(&priv->cond, &priv->mutex); + } + pthread_mutex_unlock(&priv->mutex); + + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_CHILD_DOWN for brick %s", victim->name); + } + + ret = default_notify(this, event, data); + return ret; } struct xlator_fops fops = { - .xattrop = index_xattrop, - .fxattrop = index_fxattrop, - - //interface functions follow - .getxattr = index_getxattr, - .lookup = index_lookup, - .readdir = index_readdir, - .unlink = index_unlink + .xattrop = index_xattrop, + .fxattrop = index_fxattrop, + + // interface functions follow + .getxattr = index_getxattr, + .lookup = index_lookup, + .opendir = index_opendir, + 
.readdir = index_readdir, + .unlink = index_unlink, + .rmdir = index_rmdir, + .fstat = index_fstat, }; struct xlator_dumpops dumpops; -struct xlator_cbks cbks = { - .forget = index_forget, - .release = index_release, - .releasedir = index_releasedir -}; +struct xlator_cbks cbks = {.forget = index_forget, + .release = index_release, + .releasedir = index_releasedir}; struct volume_options options[] = { - { .key = {"index-base" }, - .type = GF_OPTION_TYPE_PATH, - .description = "path where the index files need to be stored", - }, - { .key = {NULL} }, + {.key = {"index-base"}, + .type = GF_OPTION_TYPE_PATH, + .description = "path where the index files need to be stored", + .default_value = "{{ brick.path }}/.glusterfs/indices"}, + {.key = {"xattrop64-watchlist"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma separated list of xattrs that are watched", + .default_value = "trusted.ec.dirty"}, + {.key = {"xattrop-dirty-watchlist"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma separated list of xattrs that are watched", + .default_value = "trusted.afr.dirty"}, + {.key = {"xattrop-pending-watchlist"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma separated list of xattrs that are watched", + .default_value = "trusted.afr.{{ volume.name }}"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "index", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index d6dcb1c23b4..a2b6e6e2570 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -11,63 +11,76 @@ #ifndef __INDEX_H__ #define __INDEX_H__ -#include "xlator.h" -#include "call-stub.h" -#include "defaults.h" -#include "byte-order.h" -#include "common-utils.h" 
+#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> #include "index-mem-types.h" -#define INDEX_THREAD_STACK_SIZE ((size_t)(1024*1024)) +#define INDEX_THREAD_STACK_SIZE ((size_t)(1024 * 1024)) + +typedef enum { UNKNOWN, IN, NOTIN } index_state_t; typedef enum { - UNKNOWN, - IN, - NOTIN -} index_state_t; + XATTROP_TYPE_UNSET = -1, + XATTROP, + DIRTY, + ENTRY_CHANGES, + XATTROP_TYPE_END +} index_xattrop_type_t; typedef struct index_inode_ctx { - gf_boolean_t processing; - struct list_head callstubs; - index_state_t state; + gf_boolean_t processing; + struct list_head callstubs; + int state[XATTROP_TYPE_END]; + uuid_t virtual_pargfid; /* virtual gfid of dir under + .glusterfs/indices/entry-changes. */ } index_inode_ctx_t; typedef struct index_fd_ctx { - DIR *dir; + DIR *dir; + off_t dir_eof; } index_fd_ctx_t; -typedef enum { - sync_not_started, - sync_started, - synced_state, -} to_be_healed_states_t; - -typedef enum { - INDEX_XATTROP, - BASE_INDICES_HOLDER, -} readdir_directory; - typedef struct index_priv { - char *index_basepath; - uuid_t index; - gf_lock_t lock; - uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir - uuid_t base_indices_holder_vgfid; //virtual gfid of the - //to_be_healed_xattrop directory - struct list_head callstubs; - pthread_mutex_t mutex; - pthread_cond_t cond; - to_be_healed_states_t to_be_healed_states; + char *index_basepath; + char *dirty_basepath; + uuid_t index; + gf_lock_t lock; + uuid_t internal_vgfid[XATTROP_TYPE_END]; + struct list_head callstubs; + pthread_mutex_t mutex; + pthread_cond_t cond; + dict_t *dirty_watchlist; + dict_t *pending_watchlist; + dict_t *complete_watchlist; + int64_t pending_count; + pthread_t thread; + gf_boolean_t down; + gf_atomic_t stub_cnt; + int32_t curr_count; } index_priv_t; -#define INDEX_STACK_UNWIND(fop, frame, params ...) 
\ -do { \ - if (frame) { \ - inode_t *_inode = frame->local; \ - frame->local = NULL; \ - inode_unref (_inode); \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ -} while (0) +typedef struct index_local { + inode_t *inode; + dict_t *xdata; +} index_local_t; + +#define INDEX_STACK_UNWIND(fop, frame, params...) \ + do { \ + index_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local) { \ + inode_unref(__local->inode); \ + if (__local->xdata) \ + dict_unref(__local->xdata); \ + mem_put(__local); \ + } \ + } while (0) #endif diff --git a/xlators/features/leases/Makefile.am b/xlators/features/leases/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/leases/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/leases/src/Makefile.am b/xlators/features/leases/src/Makefile.am new file mode 100644 index 00000000000..a1aef10e299 --- /dev/null +++ b/xlators/features/leases/src/Makefile.am @@ -0,0 +1,20 @@ +if WITH_SERVER +xlator_LTLIBRARIES = leases.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +leases_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +leases_la_SOURCES = leases.c leases-internal.c + +leases_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = leases.h leases-mem-types.h leases-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(CONTRIBDIR)/timer-wheel + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/leases/src/leases-internal.c b/xlators/features/leases/src/leases-internal.c new file mode 100644 index 00000000000..56dee244281 --- /dev/null +++ b/xlators/features/leases/src/leases-internal.c @@ -0,0 +1,1412 @@ +/* + Copyright (c) 2015-2016 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "leases.h" + +/* Mutex locks used in this xlator and their order of acquisition: + * Check lease conflict: + * lease_ctx lock + * add_timer => internal timer locks + * lease_ctx unlock + * + * Add/remove lease: + * lease_ctx lock + * add_timer => internal timer locks + * OR + * priv lock => Adding/removing to/from the cleanup client list + * priv unlock + * lease_ctx unlock + * + * Timer thread: + * Timer internal lock + * priv lock => By timer handler + * priv unlock + * Timer internal unlock + * + * Expired recall cleanup thread: + * priv lock + * priv condwait + * priv unlock + * lease_ctx lock + * priv lock + * priv unlock + * lease_ctx unlock + */ + +/* + * Check if lease_lk is enabled + * Return Value: + * _gf_true - lease lock option enabled + * _gf_false - lease lock option disabled + */ +gf_boolean_t +is_leases_enabled(xlator_t *this) +{ + leases_private_t *priv = NULL; + gf_boolean_t is_enabled = _gf_false; + + GF_VALIDATE_OR_GOTO("leases", this, out); + + if (this->private) { + priv = (leases_private_t *)this->private; + is_enabled = priv->leases_enabled; + } +out: + return is_enabled; +} + +/* + * Get the recall_leaselk_timeout + * Return Value: + * timeout value(in seconds) set as an option to this xlator. 
+ * -1 error case + */ +static int32_t +get_recall_lease_timeout(xlator_t *this) +{ + leases_private_t *priv = NULL; + int32_t timeout = -1; + + GF_VALIDATE_OR_GOTO("leases", this, out); + + if (this->private) { + priv = (leases_private_t *)this->private; + timeout = priv->recall_lease_timeout; + } +out: + return timeout; +} + +static void +__dump_leases_info(xlator_t *this, lease_inode_ctx_t *lease_ctx) +{ + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + + GF_VALIDATE_OR_GOTO("leases", this, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + + gf_msg_debug(this->name, 0, + "Lease held on this inode, lease_type: %d," + " lease_cnt:%" PRIu64 + ", RD lease:%d, RW lease:%d, " + "openfd cnt:%" PRIu64, + lease_ctx->lease_type, lease_ctx->lease_cnt, + lease_ctx->lease_type_cnt[GF_RD_LEASE], + lease_ctx->lease_type_cnt[GF_RW_LEASE], lease_ctx->openfd_cnt); + + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + gf_msg_debug(this->name, 0, + "Leases held by client: %s, lease " + "ID:%s, RD lease:%d, RW lease:%d, lease_type: %d, " + "lease_cnt:%" PRIu64, + lease_entry->client_uid, lease_entry->lease_id, + lease_entry->lease_type_cnt[GF_RD_LEASE], + lease_entry->lease_type_cnt[GF_RW_LEASE], + lease_entry->lease_type, lease_entry->lease_cnt); + } +out: + return; +} + +static int +__lease_ctx_set(inode_t *inode, xlator_t *this) +{ + lease_inode_ctx_t *inode_ctx = NULL; + int ret = -1; + uint64_t ctx = 0; + + GF_VALIDATE_OR_GOTO("leases", inode, out); + GF_VALIDATE_OR_GOTO("leases", this, out); + + ret = __inode_ctx_get(inode, this, &ctx); + if (!ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INVAL_INODE_CTX, + "inode_ctx_get failed"); + goto out; + } + + inode_ctx = GF_CALLOC(1, sizeof(*inode_ctx), + gf_leases_mt_lease_inode_ctx_t); + GF_CHECK_ALLOC(inode_ctx, ret, out); + + pthread_mutex_init(&inode_ctx->lock, NULL); + INIT_LIST_HEAD(&inode_ctx->lease_id_list); + 
INIT_LIST_HEAD(&inode_ctx->blocked_list); + + inode_ctx->lease_cnt = 0; + + ret = __inode_ctx_set(inode, this, (uint64_t *)inode_ctx); + if (ret) { + GF_FREE(inode_ctx); + gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_INODE_CTX, + "failed to set inode ctx (%p)", inode); + } +out: + return ret; +} + +static lease_inode_ctx_t * +__lease_ctx_get(inode_t *inode, xlator_t *this) +{ + lease_inode_ctx_t *inode_ctx = NULL; + uint64_t ctx = 0; + int ret = 0; + + GF_VALIDATE_OR_GOTO("leases", inode, out); + GF_VALIDATE_OR_GOTO("leases", this, out); + + ret = __inode_ctx_get(inode, this, &ctx); + if (ret < 0) { + ret = __lease_ctx_set(inode, this); + if (ret < 0) + goto out; + + ret = __inode_ctx_get(inode, this, &ctx); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, LEASE_MSG_INVAL_INODE_CTX, + "failed to get inode ctx (%p)", inode); + goto out; + } + } + + inode_ctx = (lease_inode_ctx_t *)(long)ctx; +out: + return inode_ctx; +} + +lease_inode_ctx_t * +lease_ctx_get(inode_t *inode, xlator_t *this) +{ + lease_inode_ctx_t *inode_ctx = NULL; + + GF_VALIDATE_OR_GOTO("leases", inode, out); + GF_VALIDATE_OR_GOTO("leases", this, out); + + LOCK(&inode->lock); + { + inode_ctx = __lease_ctx_get(inode, this); + } + UNLOCK(&inode->lock); +out: + return inode_ctx; +} + +static lease_id_entry_t * +new_lease_id_entry(call_frame_t *frame, const char *lease_id) +{ + lease_id_entry_t *lease_entry = NULL; + + GF_VALIDATE_OR_GOTO("leases", frame, out); + GF_VALIDATE_OR_GOTO("leases", lease_id, out); + + lease_entry = GF_CALLOC(1, sizeof(*lease_entry), + gf_leases_mt_lease_id_entry_t); + if (!lease_entry) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, LEASE_MSG_NO_MEM, + "Memory allocation for lease_entry failed"); + return NULL; + } + + INIT_LIST_HEAD(&lease_entry->lease_id_list); + lease_entry->lease_type = NONE; + lease_entry->lease_cnt = 0; + lease_entry->recall_time = get_recall_lease_timeout(frame->this); + lease_entry->client_uid = 
gf_strdup(frame->root->client->client_uid); + if (!lease_entry->client_uid) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, LEASE_MSG_NO_MEM, + "Memory allocation for client_uid failed"); + GF_FREE(lease_entry); + lease_entry = NULL; + goto out; + } + + memcpy(lease_entry->lease_id, lease_id, LEASE_ID_SIZE); +out: + return lease_entry; +} + +static void +__destroy_lease_id_entry(lease_id_entry_t *lease_entry) +{ + GF_VALIDATE_OR_GOTO("leases", lease_entry, out); + + list_del_init(&lease_entry->lease_id_list); + GF_FREE(lease_entry->client_uid); + GF_FREE(lease_entry); +out: + return; +} + +static inline gf_boolean_t +__is_same_lease_id(const char *k1, const char *k2) +{ + if (memcmp(k1, k2, strlen(k1)) == 0) + return _gf_true; + + return _gf_false; +} + +/* Checks if there are any leases, other than the leases taken + * by the given lease_id + */ +static gf_boolean_t +__another_lease_found(lease_inode_ctx_t *lease_ctx, const char *lease_id) +{ + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + gf_boolean_t found_lease = _gf_false; + + GF_VALIDATE_OR_GOTO("leases", lease_id, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + if (!__is_same_lease_id(lease_id, lease_entry->lease_id)) { + if (lease_entry->lease_cnt > 0) { + found_lease = _gf_true; + break; + } + } + } +out: + return found_lease; +} + +/* Returns the lease_id_entry for a given lease_id and a given inode. 
+ * Return values: + * NULL - If no client entry found + * lease_id_entry_t* - a pointer to the client entry if found + */ +static lease_id_entry_t * +__get_lease_id_entry(lease_inode_ctx_t *lease_ctx, const char *lease_id) +{ + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + lease_id_entry_t *found = NULL; + + GF_VALIDATE_OR_GOTO("leases", lease_id, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + if (__is_same_lease_id(lease_id, lease_entry->lease_id)) { + found = lease_entry; + gf_msg_debug("leases", 0, + "lease ID entry found " + "Client UID:%s, lease id:%s", + lease_entry->client_uid, + leaseid_utoa(lease_entry->lease_id)); + break; + } + } +out: + return found; +} + +/* Returns the lease_id_entry for a given lease_id and a given inode, + * if none found creates one. + * Return values: + * lease_id_entry_t* - a pointer to the client entry + */ +static lease_id_entry_t * +__get_or_new_lease_entry(call_frame_t *frame, const char *lease_id, + lease_inode_ctx_t *lease_ctx) +{ + lease_id_entry_t *lease_entry = NULL; + + GF_VALIDATE_OR_GOTO("leases", frame, out); + GF_VALIDATE_OR_GOTO("leases", lease_id, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + + lease_entry = __get_lease_id_entry(lease_ctx, lease_id); + if (!lease_entry) { /* create one */ + lease_entry = new_lease_id_entry(frame, lease_id); + if (!lease_entry) + goto out; + + list_add_tail(&lease_entry->lease_id_list, &lease_ctx->lease_id_list); + + gf_msg_debug(frame->this->name, 0, + "lease ID entry added," + " Client UID:%s, lease id:%s", + lease_entry->client_uid, + leaseid_utoa(lease_entry->lease_id)); + } +out: + return lease_entry; +} + +static lease_inode_t * +new_lease_inode(inode_t *inode) +{ + lease_inode_t *l_inode = GF_MALLOC(sizeof(*l_inode), + gf_leases_mt_lease_inode_t); + if (!l_inode) + goto out; + + INIT_LIST_HEAD(&l_inode->list); + l_inode->inode = 
inode_ref(inode); +out: + return l_inode; +} + +static void +__destroy_lease_inode(lease_inode_t *l_inode) +{ + list_del_init(&l_inode->list); + inode_unref(l_inode->inode); + GF_FREE(l_inode); +} + +static lease_client_t * +new_lease_client(const char *client_uid) +{ + lease_client_t *clnt = GF_MALLOC(sizeof(*clnt), + gf_leases_mt_lease_client_t); + if (!clnt) + goto out; + + INIT_LIST_HEAD(&clnt->client_list); + INIT_LIST_HEAD(&clnt->inode_list); + clnt->client_uid = gf_strdup(client_uid); +out: + return clnt; +} + +static void +__destroy_lease_client(lease_client_t *clnt) +{ + list_del_init(&clnt->inode_list); + list_del_init(&clnt->client_list); + GF_FREE(clnt); + + return; +} + +static lease_client_t * +__get_lease_client(xlator_t *this, leases_private_t *priv, + const char *client_uid) +{ + lease_client_t *clnt = NULL; + lease_client_t *tmp = NULL; + lease_client_t *found = NULL; + + list_for_each_entry_safe(clnt, tmp, &priv->client_list, client_list) + { + if ((strcmp(clnt->client_uid, client_uid) == 0)) { + found = clnt; + gf_msg_debug(this->name, 0, + "Client:%s already found " + "in the cleanup list", + client_uid); + break; + } + } + return found; +} + +static lease_client_t * +__get_or_new_lease_client(xlator_t *this, leases_private_t *priv, + const char *client_uid) +{ + lease_client_t *found = NULL; + + found = __get_lease_client(this, priv, client_uid); + if (!found) { + found = new_lease_client(client_uid); + if (!found) + goto out; + list_add_tail(&found->client_list, &priv->client_list); + gf_msg_debug(this->name, 0, + "Adding a new client:%s entry " + "to the cleanup list", + client_uid); + } +out: + return found; +} + +static int +add_inode_to_client_list(xlator_t *this, inode_t *inode, const char *client_uid) +{ + leases_private_t *priv = this->private; + lease_client_t *clnt = NULL; + + lease_inode_t *lease_inode = new_lease_inode(inode); + if (!lease_inode) + return -ENOMEM; + + pthread_mutex_lock(&priv->mutex); + { + clnt = 
__get_or_new_lease_client(this, priv, client_uid); + if (!clnt) { + pthread_mutex_unlock(&priv->mutex); + __destroy_lease_inode(lease_inode); + return -ENOMEM; + } + list_add_tail(&clnt->inode_list, &lease_inode->list); + } + pthread_mutex_unlock(&priv->mutex); + gf_msg_debug(this->name, 0, + "Added a new inode:%p to the client(%s) " + "cleanup list, gfid(%s)", + inode, client_uid, uuid_utoa(inode->gfid)); + return 0; +} + +/* Add lease entry to the corresponding client entry. + * Return values: + * 0 Success + * -1 Failure + */ +static int +__add_lease(call_frame_t *frame, inode_t *inode, lease_inode_ctx_t *lease_ctx, + const char *client_uid, struct gf_lease *lease) +{ + lease_id_entry_t *lease_entry = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO("leases", frame, out); + GF_VALIDATE_OR_GOTO("leases", client_uid, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + GF_VALIDATE_OR_GOTO("leases", inode, out); + GF_VALIDATE_OR_GOTO("leases", lease, out); + + gf_msg_trace(frame->this->name, 0, + "Granting lease lock to client %s with lease id %s" + " on gfid(%s)", + client_uid, leaseid_utoa(lease->lease_id), + uuid_utoa(inode->gfid)); + + lease_entry = __get_or_new_lease_entry(frame, lease->lease_id, lease_ctx); + if (!lease_entry) { + errno = ENOMEM; + goto out; + } + + lease_entry->lease_type_cnt[lease->lease_type]++; + lease_entry->lease_cnt++; + lease_entry->lease_type |= lease->lease_type; + /* If this is the first lease taken by the client on the file, then + * add this inode/file to the client disconnect cleanup list + */ + if (lease_entry->lease_cnt == 1) { + add_inode_to_client_list(frame->this, inode, client_uid); + } + + lease_ctx->lease_cnt++; + lease_ctx->lease_type_cnt[lease->lease_type]++; + lease_ctx->lease_type |= lease->lease_type; + + /* Take a ref for the first lock taken on this inode. 
Corresponding + * unref when all the leases are unlocked or during DISCONNECT + * Ref is required because the inode on which lease is acquired should + * not be deleted when lru cleanup kicks in*/ + if (lease_ctx->lease_cnt == 1) { + lease_ctx->inode = inode_ref(inode); + } + + ret = 0; +out: + return ret; +} + +static gf_boolean_t +__is_clnt_lease_none(const char *client_uid, lease_inode_ctx_t *lease_ctx) +{ + gf_boolean_t lease_none = _gf_true; + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + if ((strcmp(client_uid, lease_entry->client_uid) == 0) && + (lease_entry->lease_cnt != 0)) { + lease_none = _gf_false; + break; + } + } + + return lease_none; +} + +static int +__remove_inode_from_clnt_list(xlator_t *this, lease_client_t *clnt, + inode_t *inode) +{ + int ret = -1; + lease_inode_t *l_inode = NULL; + lease_inode_t *tmp1 = NULL; + + list_for_each_entry_safe(l_inode, tmp1, &clnt->inode_list, list) + { + if (l_inode->inode == inode) { + __destroy_lease_inode(l_inode); + gf_msg_debug(this->name, 0, + "Removed the inode from the client cleanup list"); + ret = 0; + } + } + /* TODO: Remove the client entry from the cleanup list */ + + return ret; +} + +static int +remove_from_clnt_list(xlator_t *this, const char *client_uid, inode_t *inode) +{ + leases_private_t *priv = NULL; + int ret = -1; + lease_client_t *clnt = NULL; + + priv = this->private; + if (!priv) + goto out; + + pthread_mutex_lock(&priv->mutex); + { + clnt = __get_lease_client(this, priv, client_uid); + if (!clnt) { + pthread_mutex_unlock(&priv->mutex); + gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_CLNT_NOTFOUND, + "There is no client entry found in the cleanup list"); + goto out; + } + ret = __remove_inode_from_clnt_list(this, clnt, inode); + if (ret) { + pthread_mutex_unlock(&priv->mutex); + gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INODE_NOTFOUND, + "There is no inode entry 
found in the cleanup list"); + goto out; + } + } + pthread_mutex_unlock(&priv->mutex); +out: + return ret; +} + +/* Remove lease entry in the corresponding client entry. + */ +static int +__remove_lease(xlator_t *this, inode_t *inode, lease_inode_ctx_t *lease_ctx, + const char *client_uid, struct gf_lease *lease) +{ + lease_id_entry_t *lease_entry = NULL; + int ret = 0; + int32_t lease_type = 0; + leases_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + GF_VALIDATE_OR_GOTO("leases", lease, out); + + priv = this->private; + + gf_msg_trace(this->name, 0, + "Removing lease entry for client: %s, " + "lease type:%d, lease id:%s", + client_uid, lease->lease_type, leaseid_utoa(lease->lease_id)); + + /* There could be a race where in server recalled the lease and by the time + * client sends lease_unlock request, server may have revoked it. To handle + * such cases, if lease doesnt exist treat it as noop and return success. + */ + lease_entry = __get_lease_id_entry(lease_ctx, lease->lease_id); + if (!lease_entry) { + gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_UNLK_LEASE, + "Got unlock lease request from client:%s, but has no " + "corresponding lock", + client_uid); + ret = 0; + goto out; + } + + if (!(lease_entry->lease_type & lease->lease_type)) { + gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_UNLK_LEASE, + "Got unlock lease request from client:%s for an invalid " + "lease_type", + client_uid); + ret = -EINVAL; + errno = EINVAL; + goto out; + } + lease_type = lease->lease_type; + lease_entry->lease_type_cnt[lease_type]--; + lease_entry->lease_cnt--; + + lease_ctx->lease_type_cnt[lease_type]--; + lease_ctx->lease_cnt--; + + if (lease_entry->lease_type_cnt[lease_type] == 0) + lease_entry->lease_type = lease_entry->lease_type & (~lease_type); + + if (lease_ctx->lease_type_cnt[lease_type] == 0) + lease_ctx->lease_type = lease_ctx->lease_type & (~lease_type); + + if (lease_entry->lease_cnt == 0) { + if (__is_clnt_lease_none(client_uid, 
lease_ctx)) { + gf_msg_trace(this->name, 0, + "Client(%s) has no leases" + " on gfid (%s), hence removing the inode" + " from the client cleanup list", + client_uid, uuid_utoa(inode->gfid)); + remove_from_clnt_list(this, client_uid, lease_ctx->inode); + } + __destroy_lease_id_entry(lease_entry); + lease_ctx->blocked_fops_resuming = _gf_true; + } + + if (lease_ctx->lease_cnt == 0 && lease_ctx->timer) { + ret = gf_tw_del_timer(priv->timer_wheel, lease_ctx->timer); + lease_ctx->recall_in_progress = _gf_false; + lease_ctx->timer = NULL; + } +out: + return ret; +} + +static gf_boolean_t +__is_lease_grantable(xlator_t *this, lease_inode_ctx_t *lease_ctx, + struct gf_lease *lease, inode_t *inode) +{ + uint32_t fd_count = 0; + int32_t flags = 0; + fd_t *iter_fd = NULL; + gf_boolean_t grant = _gf_false; + int ret = 0; + lease_fd_ctx_t *fd_ctx = NULL; + uint64_t ctx = 0; + + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + GF_VALIDATE_OR_GOTO("leases", lease, out); + GF_VALIDATE_OR_GOTO("leases", inode, out); + + if (lease_ctx->recall_in_progress) { + gf_msg_debug(this->name, 0, + "Recall in progress, hence " + "failing the lease request"); + grant = _gf_false; + goto out; + } + + if (lease_ctx->blocked_fops_resuming) { + gf_msg_debug(this->name, 0, + "Previously blocked fops resuming, hence " + "failing the lease request"); + grant = _gf_false; + goto out; + } + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + ret = fd_ctx_get(iter_fd, this, &ctx); + if (ret < 0) { + grant = _gf_false; + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INVAL_FD_CTX, + "Unable to get fd ctx"); + goto out; + } + fd_ctx = (lease_fd_ctx_t *)(long)ctx; + + /* Check for open fd conflict, note that open fds from + * the same lease id is not checked for conflict, as it is + * lease id based lease. 
+ */ + if (fd_ctx->client_uid != NULL && + !__is_same_lease_id(fd_ctx->lease_id, lease->lease_id)) { + fd_count++; + flags |= iter_fd->flags; + } + } + } + UNLOCK(&inode->lock); + + gf_msg_debug(this->name, 0, "open fd count:%d flags:%d", fd_count, flags); + + __dump_leases_info(this, lease_ctx); + + switch (lease->lease_type) { + case GF_RD_LEASE: + /* check open fd conflict */ + if ((fd_count > 0) && ((flags & O_WRONLY) || (flags & O_RDWR))) { + grant = _gf_false; + break; + } + + /* check for conflict with existing leases */ + if (lease_ctx->lease_type == NONE || + lease_ctx->lease_type == GF_RD_LEASE || + !(__another_lease_found(lease_ctx, lease->lease_id))) + grant = _gf_true; + else + grant = _gf_false; + break; + + case GF_RW_LEASE: + /* check open fd conflict; conflict if there are any fds open + * other than the client on which the lease is requested. */ + if (fd_count > 0) { + grant = _gf_false; + break; + } + + /* check existing lease conflict */ + if (lease_ctx->lease_type == NONE || + !(__another_lease_found(lease_ctx, lease->lease_id))) + grant = _gf_true; + else + grant = _gf_false; + break; + + default: + gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_INVAL_LEASE_TYPE, + "Invalid lease type specified"); + break; + } +out: + return grant; +} + +static void +do_blocked_fops(xlator_t *this, lease_inode_ctx_t *lease_ctx) +{ + struct list_head wind_list; + fop_stub_t *blk_fop = NULL; + fop_stub_t *tmp = NULL; + + INIT_LIST_HEAD(&wind_list); + + pthread_mutex_lock(&lease_ctx->lock); + { + if (!lease_ctx->blocked_fops_resuming) { + /* lease_ctx->blocked_fops_resuming will be set + * only when the last lease is released. That + * is when we need to resume blocked fops and unref + * the inode taken in __add_lease (when lease_cnt == 1). + * Return otherwise. 
+ */ + pthread_mutex_unlock(&lease_ctx->lock); + return; + } + + list_for_each_entry_safe(blk_fop, tmp, &lease_ctx->blocked_list, list) + { + list_del_init(&blk_fop->list); + list_add_tail(&blk_fop->list, &wind_list); + } + } + pthread_mutex_unlock(&lease_ctx->lock); + + gf_msg_trace(this->name, 0, "Executing the blocked stubs on gfid(%s)", + uuid_utoa(lease_ctx->inode->gfid)); + list_for_each_entry_safe(blk_fop, tmp, &wind_list, list) + { + list_del_init(&blk_fop->list); + gf_msg_trace(this->name, 0, "Executing fop:%d", blk_fop->stub->fop); + call_resume(blk_fop->stub); + GF_FREE(blk_fop); + } + + pthread_mutex_lock(&lease_ctx->lock); + { + lease_ctx->lease_type = NONE; + /* unref the inode taken in __add_lease + * (when lease_cnt == 1) */ + lease_ctx->blocked_fops_resuming = _gf_false; + inode_unref(lease_ctx->inode); + lease_ctx->inode = NULL; + } + pthread_mutex_unlock(&lease_ctx->lock); + + return; +} + +void +recall_lease_timer_handler(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + inode_t *inode = NULL; + lease_inode_t *lease_inode = NULL; + leases_private_t *priv = NULL; + lease_timer_data_t *timer_data = NULL; + + timer_data = data; + + priv = timer_data->this->private; + inode = timer_data->inode; + lease_inode = new_lease_inode(inode); + if (!lease_inode) { + errno = ENOMEM; + goto out; + } + pthread_mutex_lock(&priv->mutex); + { + list_add_tail(&lease_inode->list, &priv->recall_list); + pthread_cond_broadcast(&priv->cond); + } + pthread_mutex_unlock(&priv->mutex); +out: + /* unref the inode_ref taken by timer_data in __recall_lease */ + inode_unref(timer_data->inode); + + GF_FREE(timer); +} + +static void +__recall_lease(xlator_t *this, lease_inode_ctx_t *lease_ctx) +{ + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + struct gf_upcall up_req = { + 0, + }; + struct gf_upcall_recall_lease recall_req = { + 0, + }; + int notify_ret = -1; + struct gf_tw_timer_list *timer = NULL; + leases_private_t *priv = 
NULL; + lease_timer_data_t *timer_data = NULL; + time_t recall_time; + + if (lease_ctx->recall_in_progress) { + gf_msg_debug(this->name, 0, + "Lease recall is already in " + "progress, hence not sending another recall"); + goto out; + } + + priv = this->private; + recall_time = gf_time(); + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + gf_uuid_copy(up_req.gfid, lease_ctx->inode->gfid); + up_req.client_uid = lease_entry->client_uid; + up_req.event_type = GF_UPCALL_RECALL_LEASE; + up_req.data = &recall_req; + + notify_ret = this->notify(this, GF_EVENT_UPCALL, &up_req); + if (notify_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_RECALL_FAIL, + "Recall notification to client: %s failed", + lease_entry->client_uid); + /* Do not return from here, continue registering the timer, + this is required mostly o keep replicas in sync*/ + } else { + gf_msg_debug(this->name, 0, + "Recall lease (all)" + "notification sent to client %s", + lease_entry->client_uid); + } + + lease_ctx->recall_in_progress = _gf_true; + lease_entry->recall_time = recall_time; + } + timer = GF_MALLOC(sizeof(*timer), gf_common_mt_tw_timer_list); + if (!timer) { + goto out; + } + timer_data = GF_MALLOC(sizeof(lease_timer_data_t), + gf_leases_mt_timer_data_t); + if (!timer_data) { + GF_FREE(timer); + goto out; + } + + timer_data->inode = inode_ref(lease_ctx->inode); + timer_data->this = this; + timer->data = timer_data; + + INIT_LIST_HEAD(&timer->entry); + timer->expires = get_recall_lease_timeout(this); + timer->function = recall_lease_timer_handler; + lease_ctx->timer = timer; + gf_tw_add_timer(priv->timer_wheel, timer); + gf_msg_trace(this->name, 0, + "Registering timer " + "%p, after " + "sending recall", + timer); +out: + return; +} + +/* ret = 0; STACK_UNWIND Success + * ret = -1; STACK_UNWIND failure + */ +int +process_lease_req(call_frame_t *frame, xlator_t *this, inode_t *inode, + struct gf_lease *lease) +{ + int ret = 0; + char 
*client_uid = NULL; + lease_inode_ctx_t *lease_ctx = NULL; + + GF_VALIDATE_OR_GOTO("leases", frame, out); + GF_VALIDATE_OR_GOTO("leases", this, out); + GF_VALIDATE_OR_GOTO("leases", inode, out); + GF_VALIDATE_OR_GOTO("leases", lease, out); + + client_uid = frame->root->client->client_uid; + + if (!is_valid_lease_id(lease->lease_id)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_INVAL_LEASE_ID, + "Invalid lease id, from" + "client:%s", + client_uid); + ret = -EINVAL; + errno = EINVAL; + goto out; + } + + lease_ctx = lease_ctx_get(inode, this); + if (!lease_ctx) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, + "Unable to create/get inode ctx, " + "inode:%p", + inode); + ret = -ENOMEM; + errno = ENOMEM; + goto out; + } + + gf_msg_debug(this->name, 0, + "Lease request from client: %s, " + "lease type:%d, lease cmd:%d, lease ID:%s, gfid:%s", + client_uid, lease->lease_type, lease->cmd, + leaseid_utoa(lease->lease_id), uuid_utoa(inode->gfid)); + + pthread_mutex_lock(&lease_ctx->lock); + { + switch (lease->cmd) { + case GF_GET_LEASE: + lease->lease_type = lease_ctx->lease_type; + gf_msg_debug(this->name, 0, + "Get lease, existing lease" + "type: %d", + lease_ctx->lease_type); + /*TODO:Should it consider lease id or client_uid?*/ + break; + + case GF_SET_LEASE: + if (__is_lease_grantable(this, lease_ctx, lease, inode)) { + __add_lease(frame, inode, lease_ctx, client_uid, lease); + ret = 0; + } else { + gf_msg_debug(this->name, GF_LOG_DEBUG, + "Not granting the conflicting lease" + " request from %s on gfid(%s)", + client_uid, uuid_utoa(inode->gfid)); + __recall_lease(this, lease_ctx); + ret = -1; + } + break; + case GF_UNLK_LEASE: + ret = __remove_lease(this, inode, lease_ctx, client_uid, lease); + if ((ret >= 0) && (lease_ctx->lease_cnt == 0)) { + pthread_mutex_unlock(&lease_ctx->lock); + goto unblock; + } + break; + default: + ret = -EINVAL; + break; + } + } + pthread_mutex_unlock(&lease_ctx->lock); + + return ret; + +unblock: + 
do_blocked_fops(this, lease_ctx); +out: + return ret; +} + +/* ret = 1 conflict + * ret = 0 no conflict + */ +gf_boolean_t +__check_lease_conflict(call_frame_t *frame, lease_inode_ctx_t *lease_ctx, + const char *lease_id, gf_boolean_t is_write) +{ + gf_lease_types_t lease_type = { + 0, + }; + gf_boolean_t conflicts = _gf_false; + lease_id_entry_t *lease_entry = NULL; + + GF_VALIDATE_OR_GOTO("leases", frame, out); + GF_VALIDATE_OR_GOTO("leases", lease_ctx, out); + + lease_type = lease_ctx->lease_type; + + /* If the fop is rename or unlink conflict the lease even if its + * from the same client?? + */ + if ((frame->root->op == GF_FOP_RENAME) || + (frame->root->op == GF_FOP_UNLINK)) { + conflicts = _gf_true; + goto recall; + } + + /* As internal fops are used to maintain data integrity but do not + * make modififications to the client data, no need to conflict with + * them. + * + * @todo: like for locks, even lease state has to be handled by + * rebalance or self-heal daemon process. */ + if (frame->root->pid < 0) { + conflicts = _gf_false; + goto recall; + } + + /* If lease_id is not sent, set conflicts = true if there is + * an existing lease */ + if (!lease_id && (lease_ctx->lease_cnt > 0)) { + conflicts = _gf_true; + goto recall; + } + + switch (lease_type) { + case (GF_RW_LEASE | GF_RD_LEASE): + case GF_RW_LEASE: + lease_entry = __get_lease_id_entry(lease_ctx, lease_id); + if (lease_entry && (lease_entry->lease_type & GF_RW_LEASE)) + conflicts = _gf_false; + else + conflicts = _gf_true; + break; + case GF_RD_LEASE: + if (is_write && __another_lease_found(lease_ctx, lease_id)) + conflicts = _gf_true; + else + conflicts = _gf_false; + break; + default: + break; + } + +recall: + /* If there is a conflict found and recall is not already sent to all + * the clients, then send recall to each of the client holding lease. 
+ */ + if (conflicts) + __recall_lease(frame->this, lease_ctx); +out: + return conflicts; +} + +/* Return values: + * -1 : error, unwind the fop + * WIND_FOP: No conflict, wind the fop + * BLOCK_FOP: Found a conflicting lease, block the fop + */ +int +check_lease_conflict(call_frame_t *frame, inode_t *inode, const char *lease_id, + uint32_t fop_flags) +{ + lease_inode_ctx_t *lease_ctx = NULL; + gf_boolean_t is_blocking_fop = _gf_false; + gf_boolean_t is_write_fop = _gf_false; + gf_boolean_t conflicts = _gf_false; + int ret = WIND_FOP; + + lease_ctx = lease_ctx_get(inode, frame->this); + if (!lease_ctx) { + gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, + "Unable to create/get inode ctx"); + ret = -1; + errno = ENOMEM; + goto out; + } + + is_blocking_fop = ((fop_flags & BLOCKING_FOP) != 0); + is_write_fop = ((fop_flags & DATA_MODIFY_FOP) != 0); + + pthread_mutex_lock(&lease_ctx->lock); + { + if (lease_ctx->lease_type == NONE) { + pthread_mutex_unlock(&lease_ctx->lock); + gf_msg_debug(frame->this->name, 0, + "No leases found continuing with the" + " fop:%s", + gf_fop_list[frame->root->op]); + ret = WIND_FOP; + goto out; + } + conflicts = __check_lease_conflict(frame, lease_ctx, lease_id, + is_write_fop); + if (conflicts) { + if (is_blocking_fop) { + gf_msg_debug(frame->this->name, 0, + "Fop: %s " + "conflicting existing " + "lease: %d, blocking the" + "fop", + gf_fop_list[frame->root->op], + lease_ctx->lease_type); + ret = BLOCK_FOP; + } else { + gf_msg_debug(frame->this->name, 0, + "Fop: %s " + "conflicting existing " + "lease: %d, sending " + "EAGAIN", + gf_fop_list[frame->root->op], + lease_ctx->lease_type); + errno = EAGAIN; + ret = -1; + } + } + } + pthread_mutex_unlock(&lease_ctx->lock); +out: + return ret; +} + +static int +remove_clnt_leases(const char *client_uid, inode_t *inode, xlator_t *this) +{ + lease_inode_ctx_t *lease_ctx = NULL; + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + int ret = 0; + int i = 0; + 
+ lease_ctx = lease_ctx_get(inode, this); + if (!lease_ctx) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_INVAL_INODE_CTX, + "Unable to create/get inode ctx"); + ret = -1; + errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&lease_ctx->lock); + { + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + if (strcmp(client_uid, lease_entry->client_uid) == 0) { + for (i = 0; i < GF_LEASE_MAX_TYPE; i++) { + lease_ctx->lease_type_cnt[i] -= lease_entry + ->lease_type_cnt[i]; + } + lease_ctx->lease_cnt -= lease_entry->lease_cnt; + __destroy_lease_id_entry(lease_entry); + if (lease_ctx->lease_cnt == 0) { + lease_ctx->blocked_fops_resuming = _gf_true; + pthread_mutex_unlock(&lease_ctx->lock); + goto unblock; + } + } + } + } + pthread_mutex_unlock(&lease_ctx->lock); +out: + return ret; + +unblock: + do_blocked_fops(this, lease_ctx); + return ret; +} + +int +cleanup_client_leases(xlator_t *this, const char *client_uid) +{ + lease_client_t *clnt = NULL; + lease_client_t *tmp = NULL; + struct list_head cleanup_list = { + 0, + }; + lease_inode_t *l_inode = NULL; + lease_inode_t *tmp1 = NULL; + leases_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + if (!priv) { + ret = -1; + errno = EINVAL; + goto out; + } + + INIT_LIST_HEAD(&cleanup_list); + pthread_mutex_lock(&priv->mutex); + { + list_for_each_entry_safe(clnt, tmp, &priv->client_list, client_list) + { + if ((strcmp(clnt->client_uid, client_uid) == 0)) { + list_for_each_entry_safe(l_inode, tmp1, &clnt->inode_list, list) + { + list_del_init(&l_inode->list); + list_add_tail(&l_inode->list, &cleanup_list); + } + __destroy_lease_client(clnt); + break; + } + } + } + pthread_mutex_unlock(&priv->mutex); + + l_inode = tmp1 = NULL; + list_for_each_entry_safe(l_inode, tmp1, &cleanup_list, list) + { + remove_clnt_leases(client_uid, l_inode->inode, this); + __destroy_lease_inode(l_inode); + } +out: + return ret; +} + +static void +__remove_all_leases(xlator_t *this, 
lease_inode_ctx_t *lease_ctx) +{ + int i = 0; + lease_id_entry_t *lease_entry = NULL; + lease_id_entry_t *tmp = NULL; + + if (lease_ctx->lease_cnt == 0) { + /* No leases to remove. Return */ + return; + } + __dump_leases_info(this, lease_ctx); + + list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list, + lease_id_list) + { + lease_entry->lease_cnt = 0; + remove_from_clnt_list(this, lease_entry->client_uid, lease_ctx->inode); + __destroy_lease_id_entry(lease_entry); + } + INIT_LIST_HEAD(&lease_ctx->lease_id_list); + for (i = 0; i <= GF_LEASE_MAX_TYPE; i++) + lease_ctx->lease_type_cnt[i] = 0; + lease_ctx->lease_type = 0; + lease_ctx->lease_cnt = 0; + lease_ctx->recall_in_progress = _gf_false; + lease_ctx->timer = NULL; + lease_ctx->blocked_fops_resuming = _gf_true; + + /* TODO: + * - Mark the corresponding fd bad. Could be done on client side + * as a result of recall + * - Free the lease_ctx + */ + return; +} + +static int +remove_all_leases(xlator_t *this, inode_t *inode) +{ + lease_inode_ctx_t *lease_ctx = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("leases", inode, out); + + lease_ctx = lease_ctx_get(inode, this); + if (!lease_ctx) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_INVAL_INODE_CTX, + "Unable to create/get inode ctx"); + ret = -1; + errno = ENOMEM; + goto out; + } + + pthread_mutex_lock(&lease_ctx->lock); + { + __remove_all_leases(this, lease_ctx); + } + pthread_mutex_unlock(&lease_ctx->lock); + + do_blocked_fops(this, lease_ctx); +out: + return ret; +} + +void * +expired_recall_cleanup(void *data) +{ + struct timespec sleep_till = { + 0, + }; + struct list_head recall_cleanup_list; + lease_inode_t *recall_entry = NULL; + lease_inode_t *tmp = NULL; + leases_private_t *priv = NULL; + xlator_t *this = NULL; + time_t time_now; + + GF_VALIDATE_OR_GOTO("leases", data, out); + + this = data; + priv = this->private; + + gf_msg_debug(this->name, 0, "Started the expired_recall_cleanup thread"); + + while (1) { + time_now = gf_time(); + 
pthread_mutex_lock(&priv->mutex); + { + if (priv->fini) { + pthread_mutex_unlock(&priv->mutex); + goto out; + } + INIT_LIST_HEAD(&recall_cleanup_list); + if (list_empty(&priv->recall_list)) { + sleep_till.tv_sec = time_now + 600; + pthread_cond_timedwait(&priv->cond, &priv->mutex, &sleep_till); + } + if (!list_empty(&priv->recall_list)) { + gf_msg_debug(this->name, 0, "Found expired recalls"); + list_for_each_entry_safe(recall_entry, tmp, &priv->recall_list, + list) + { + list_del_init(&recall_entry->list); + list_add_tail(&recall_entry->list, &recall_cleanup_list); + } + } + } + pthread_mutex_unlock(&priv->mutex); + + recall_entry = tmp = NULL; + list_for_each_entry_safe(recall_entry, tmp, &recall_cleanup_list, list) + { + gf_msg_debug(this->name, 0, + "Recall lease was sent on" + " inode:%p, recall timer has expired" + " and clients haven't unlocked the lease" + " hence cleaning up leases on the inode", + recall_entry->inode); + remove_all_leases(this, recall_entry->inode); + /* no need to take priv->mutex lock as this entry + * reference is removed from global recall list. */ + __destroy_lease_inode(recall_entry); + } + } + +out: + return NULL; +} diff --git a/xlators/features/leases/src/leases-mem-types.h b/xlators/features/leases/src/leases-mem-types.h new file mode 100644 index 00000000000..25664b44156 --- /dev/null +++ b/xlators/features/leases/src/leases-mem-types.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __LEASES_MEM_TYPES_H__ +#define __LEASES_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_leases_mem_types_ { + gf_leases_mt_private_t = gf_common_mt_end + 1, + gf_leases_mt_lease_client_t, + gf_leases_mt_lease_inode_t, + gf_leases_mt_fd_ctx_t, + gf_leases_mt_lease_inode_ctx_t, + gf_leases_mt_lease_id_entry_t, + gf_leases_mt_fop_stub_t, + gf_leases_mt_timer_data_t, + gf_leases_mt_end +}; +#endif diff --git a/xlators/features/leases/src/leases-messages.h b/xlators/features/leases/src/leases-messages.h new file mode 100644 index 00000000000..da696b832de --- /dev/null +++ b/xlators/features/leases/src/leases-messages.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _LEASES_MESSAGES_H_ +#define _LEASES_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(LEASES, LEASE_MSG_NO_MEM, LEASE_MSG_RECALL_FAIL, + LEASE_MSG_INVAL_LEASE_ID, LEASE_MSG_INVAL_UNLK_LEASE, + LEASE_MSG_INVAL_INODE_CTX, LEASE_MSG_NOT_ENABLED, + LEASE_MSG_NO_TIMER_WHEEL, LEASE_MSG_CLNT_NOTFOUND, + LEASE_MSG_INODE_NOTFOUND, LEASE_MSG_INVAL_FD_CTX, + LEASE_MSG_INVAL_LEASE_TYPE); + +#endif /* !_LEASES_MESSAGES_H_ */ diff --git a/xlators/features/leases/src/leases.c b/xlators/features/leases/src/leases.c new file mode 100644 index 00000000000..04bee50ba3f --- /dev/null +++ b/xlators/features/leases/src/leases.c @@ -0,0 +1,1168 @@ +/* + Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "leases.h" + +int32_t +leases_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); + + return 0; +} + +int32_t +leases_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + uint32_t fop_flags = 0; + int32_t op_errno = EINVAL; + int ret = 0; + lease_fd_ctx_t *fd_ctx = NULL; + char *lease_id = NULL; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_leases_mt_fd_ctx_t); + if (!fd_ctx) { + op_errno = ENOMEM; + goto err; + } + + fd_ctx->client_uid = gf_strdup(frame->root->client->client_uid); + if (!fd_ctx->client_uid) { + op_errno = ENOMEM; + goto err; + } + + GET_FLAGS(frame->root->op, flags); + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + if (lease_id != NULL) + memcpy(fd_ctx->lease_id, lease_id, 
LEASE_ID_SIZE); + else + memset(fd_ctx->lease_id, 0, LEASE_ID_SIZE); + + ret = fd_ctx_set(fd, this, (uint64_t)(uintptr_t)fd_ctx); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, open, frame, this, loc, flags, fd, xdata); + return 0; + +out: + STACK_WIND(frame, leases_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; + +err: + if (fd_ctx) { + GF_FREE(fd_ctx->client_uid); + GF_FREE(fd_ctx); + } + + STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int32_t +leases_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int32_t +leases_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, writev, frame, this, fd, vector, count, off, + flags, iobref, xdata); + return 0; + +out: + STACK_WIND(frame, leases_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags, + iobref, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(writev, frame, -1, errno, NULL, NULL, NULL); + return 0; 
+} + +int32_t +leases_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +int32_t +leases_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, readv, frame, this, fd, size, offset, flags, + xdata); + return 0; + +out: + STACK_WIND(frame, leases_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(readv, frame, -1, errno, NULL, 0, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata); + + return 0; +} + +int32_t +leases_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS_LK(cmd, flock->l_type, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + 
else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, lk, frame, this, fd, cmd, flock, xdata); + return 0; + +out: + STACK_WIND(frame, leases_lk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(lk, frame, -1, errno, NULL, NULL); + return 0; +} + +int32_t +leases_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) +{ + int32_t op_errno = 0; + int ret = 0; + struct gf_lease nullease = { + 0, + }; + int32_t op_ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + ret = process_lease_req(frame, this, loc->inode, lease); + if (ret < 0) { + op_errno = -ret; + op_ret = -1; + } + goto unwind; + +out: + gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_NOT_ENABLED, + "\"features/leases\" translator is not enabled. " + "You need to enable it for proper functioning of your " + "application"); + op_errno = ENOSYS; + op_ret = -1; + +unwind: + STACK_UNWIND_STRICT(lease, frame, op_ret, op_errno, + (op_errno == ENOSYS) ? 
&nullease : lease, xdata); + return 0; +} + +int32_t +leases_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int32_t +leases_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); + + ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(loc->inode, truncate, frame, this, loc, offset, xdata); + return 0; + +out: + STACK_WIND(frame, leases_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(truncate, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + + return 0; +} + +int32_t +leases_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); + + ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == 
WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(loc->inode, setattr, frame, this, loc, stbuf, valid, xdata); + return 0; + +out: + STACK_WIND(frame, leases_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(setattr, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, stbuf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + + return 0; +} + +int32_t +leases_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + /* should the lease be also checked for newloc */ + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); + + ret = check_lease_conflict(frame, oldloc->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(oldloc->inode, rename, frame, this, oldloc, newloc, xdata); + return 0; + +out: + STACK_WIND(frame, leases_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(rename, frame, -1, errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int32_t +leases_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + + return 0; +} 
+ +int32_t +leases_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); + + ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(loc->inode, unlink, frame, this, loc, xflag, xdata); + return 0; + +out: + STACK_WIND(frame, leases_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(unlink, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + + return 0; +} + +int32_t +leases_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); + + ret = check_lease_conflict(frame, oldloc->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(oldloc->inode, link, frame, this, oldloc, newloc, xdata); + return 0; +out: + STACK_WIND(frame, leases_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(link, frame, -1, 
errno, NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + + return 0; +} + +int32_t +leases_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, create, frame, this, loc, flags, mode, umask, fd, + xdata); + return 0; + +out: + STACK_WIND(frame, leases_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; + +err: + STACK_UNWIND_STRICT(create, frame, -1, errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int32_t +leases_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +leases_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = 
check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, fsync, frame, this, fd, flags, xdata); + return 0; + +out: + STACK_WIND(frame, leases_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +err: + STACK_UNWIND_STRICT(fsync, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int32_t +leases_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, 0); /* TODO:fd->flags?*/ + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, ftruncate, frame, this, fd, offset, xdata); + return 0; + +out: + STACK_WIND(frame, leases_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(ftruncate, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + return 0; +} + +int32_t +leases_fsetattr(call_frame_t *frame, xlator_t *this, fd_t 
*fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, fsetattr, frame, this, fd, stbuf, valid, xdata); + return 0; + +out: + STACK_WIND(frame, leases_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(fsetattr, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + +int32_t +leases_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, fallocate, frame, this, fd, mode, offset, len, + xdata); + return 0; + +out: + STACK_WIND(frame, leases_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, errno, NULL, 
NULL, NULL); + return 0; +} + +int32_t +leases_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + +int32_t +leases_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, discard, frame, this, fd, offset, len, xdata); + return 0; + +out: + STACK_WIND(frame, leases_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(discard, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int32_t +leases_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + +int +leases_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + 
+block: + LEASE_BLOCK_FOP(fd->inode, zerofill, frame, this, fd, offset, len, xdata); + return 0; + +out: + STACK_WIND(frame, leases_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, errno, NULL, NULL, NULL); + return 0; +} + +int +leases_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +leases_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + uint32_t fop_flags = 0; + char *lease_id = NULL; + int ret = 0; + lease_fd_ctx_t *fd_ctx = NULL; + uint64_t ctx = 0; + + EXIT_IF_LEASES_OFF(this, out); + EXIT_IF_INTERNAL_FOP(frame, xdata, out); + + GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid); + GET_FLAGS(frame->root->op, fd->flags); + + ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags); + if (ret < 0) + goto err; + else if (ret == BLOCK_FOP) + goto block; + else if (ret == WIND_FOP) + goto out; + +block: + LEASE_BLOCK_FOP(fd->inode, flush, frame, this, fd, xdata); + return 0; + +out: + /* * + * currently release is not called after the close fop from the + * application. Hence lease fd ctx is reset on here. + * This is actually not the right way, since flush can be called + * not only from the close op. + * TODO : + * - Either identify the flush is called from close call on fd from + * from the application. 
+ * OR + * - Find why release is not called post the last close call + */ + ret = fd_ctx_get(fd, this, &ctx); + if (ret == 0) { + fd_ctx = (lease_fd_ctx_t *)(long)ctx; + if (fd_ctx->client_uid) { + GF_FREE(fd_ctx->client_uid); + fd_ctx->client_uid = NULL; + } + memset(fd_ctx->lease_id, 0, LEASE_ID_SIZE); + } + STACK_WIND(frame, leases_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(create, frame, -1, errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_leases_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, + "mem account init failed"); + return ret; + } + + return ret; +} + +static int +leases_init_priv(xlator_t *this) +{ + int ret = 0; + leases_private_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + if (!priv->timer_wheel) { + priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx); + if (!priv->timer_wheel) { + ret = -1; + goto out; + } + } + + if (!priv->inited_recall_thr) { + ret = gf_thread_create(&priv->recall_thr, NULL, expired_recall_cleanup, + this, "leasercl"); + if (!ret) + priv->inited_recall_thr = _gf_true; + } + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + leases_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + GF_ASSERT(priv); + + /* TODO: In case of reconfigure, if its enabling the leases + * its not an issue, but if its disabling the leases, there + * is more to it, like recall all the existing leases, wait + * for unlock of all the leases etc., hence not supporting the + * reconfigure for now. 
+ + GF_OPTION_RECONF ("leases", priv->leases_enabled, + options, bool, out); + + if (priv->leases_enabled) { + ret = leases_init_priv (this); + if (ret) + goto out; + } + */ + + GF_OPTION_RECONF("lease-lock-recall-timeout", priv->recall_lease_timeout, + options, int32, out); + + ret = 0; +out: + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + leases_private_t *priv = NULL; + + priv = GF_CALLOC(1, sizeof(*priv), gf_leases_mt_private_t); + if (!priv) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, + "Leases init failed"); + goto out; + } + + GF_OPTION_INIT("leases", priv->leases_enabled, bool, out); + GF_OPTION_INIT("lease-lock-recall-timeout", priv->recall_lease_timeout, + int32, out); + pthread_mutex_init(&priv->mutex, NULL); + INIT_LIST_HEAD(&priv->client_list); + INIT_LIST_HEAD(&priv->recall_list); + + this->private = priv; + + if (priv->leases_enabled) { + ret = leases_init_priv(this); + if (ret) + goto out; + } + + ret = 0; + +out: + if (ret) { + GF_FREE(priv); + this->private = NULL; + } + + return ret; +} + +void +fini(xlator_t *this) +{ + leases_private_t *priv = NULL; + + priv = this->private; + if (!priv) { + return; + } + this->private = NULL; + + priv->fini = _gf_true; + pthread_cond_broadcast(&priv->cond); + if (priv->recall_thr) { + gf_thread_cleanup_xint(priv->recall_thr); + priv->recall_thr = 0; + priv->inited_recall_thr = _gf_false; + } + + if (priv->timer_wheel) { + glusterfs_ctx_tw_put(this->ctx); + } + + GF_FREE(priv); + return; +} + +static int +leases_forget(xlator_t *this, inode_t *inode) +{ + /* TODO:leases_cleanup_inode_ctx (this, inode); */ + return 0; +} + +static int +leases_release(xlator_t *this, fd_t *fd) +{ + int ret = -1; + uint64_t tmp = 0; + lease_fd_ctx_t *fd_ctx = NULL; + + if (fd == NULL) { + goto out; + } + + gf_log(this->name, GF_LOG_TRACE, "Releasing all leases with fd %p", fd); + + ret = fd_ctx_del(fd, this, &tmp); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx"); + 
goto out; + } + + fd_ctx = (lease_fd_ctx_t *)(long)tmp; + if (fd_ctx) + GF_FREE(fd_ctx); +out: + return ret; +} + +static int +leases_clnt_disconnect_cbk(xlator_t *this, client_t *client) +{ + int ret = 0; + + EXIT_IF_LEASES_OFF(this, out); + + ret = cleanup_client_leases(this, client->client_uid); +out: + return ret; +} + +struct xlator_fops fops = { + /* Metadata modifying fops */ + .fsetattr = leases_fsetattr, + .setattr = leases_setattr, + + /* File Data reading fops */ + .open = leases_open, + .readv = leases_readv, + + /* File Data modifying fops */ + .truncate = leases_truncate, + .ftruncate = leases_ftruncate, + .writev = leases_writev, + .zerofill = leases_zerofill, + .fallocate = leases_fallocate, + .discard = leases_discard, + .lk = leases_lk, + .fsync = leases_fsync, + .flush = leases_flush, + .lease = leases_lease, + + /* Directory Data modifying fops */ + .create = leases_create, + .rename = leases_rename, + .unlink = leases_unlink, + .link = leases_link, + +#ifdef NOT_SUPPORTED + /* internal lk fops */ + .inodelk = leases_inodelk, + .finodelk = leases_finodelk, + .entrylk = leases_entrylk, + .fentrylk = leases_fentrylk, + + /* Internal special fops*/ + .xattrop = leases_xattrop, + .fxattrop = leases_fxattrop, +#endif +}; + +struct xlator_cbks cbks = { + .forget = leases_forget, + .release = leases_release, + .client_disconnect = leases_clnt_disconnect_cbk, +}; + +struct volume_options options[] = { + {.key = {"leases"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "When \"on\", enables leases support"}, + {.key = {"lease-lock-recall-timeout"}, + .type = GF_OPTION_TYPE_INT, + .default_value = RECALL_LEASE_LK_TIMEOUT, + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "After 'timeout' seconds since the recall_lease" + " request has been sent to the client, the lease lock" + " will be 
forcefully purged by the server."}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "leases", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/leases/src/leases.h b/xlators/features/leases/src/leases.h new file mode 100644 index 00000000000..a6e8a6824cc --- /dev/null +++ b/xlators/features/leases/src/leases.h @@ -0,0 +1,259 @@ +/* + Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _LEASES_H +#define _LEASES_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <glusterfs/common-utils.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/logging.h> +#include <glusterfs/client_t.h> +#include <glusterfs/lkowner.h> +#include <glusterfs/locking.h> +#include <glusterfs/upcall-utils.h> +#include "timer-wheel.h" +#include "leases-mem-types.h" +#include "leases-messages.h" + +/* The time period for which a client lease lock will be stored after its been + * recalled for the first time. 
*/ +#define RECALL_LEASE_LK_TIMEOUT "60" + +#define DATA_MODIFY_FOP 0x0001 +#define BLOCKING_FOP 0x0002 + +#define BLOCK_FOP 0x0001 +#define WIND_FOP 0x0002 + +#define EXIT_IF_LEASES_OFF(this, label) \ + do { \ + if (!is_leases_enabled(this)) \ + goto label; \ + } while (0) + +#define EXIT_IF_INTERNAL_FOP(frame, xdata, label) \ + do { \ + if (frame->root->pid < 0) \ + goto label; \ + if (xdata && dict_get(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) \ + goto label; \ + } while (0) + +#define GET_LEASE_ID(xdata, lease_id, client_uid) \ + do { \ + int ret_val = -1; \ + ret_val = dict_get_bin(xdata, "lease-id", (void **)&lease_id); \ + if (ret_val) { \ + ret_val = 0; \ + gf_msg_debug("leases", 0, "Lease id is not set for client:%s", \ + client_uid); \ + } \ + } while (0) + +#define GET_FLAGS(fop, fd_flags) \ + do { \ + if ((fd_flags & (O_WRONLY | O_RDWR)) && fop == GF_FOP_OPEN) \ + fop_flags = DATA_MODIFY_FOP; \ + \ + if (fop == GF_FOP_UNLINK || fop == GF_FOP_RENAME || \ + fop == GF_FOP_TRUNCATE || fop == GF_FOP_FTRUNCATE || \ + fop == GF_FOP_FLUSH || fop == GF_FOP_FSYNC || \ + fop == GF_FOP_WRITE || fop == GF_FOP_FALLOCATE || \ + fop == GF_FOP_DISCARD || fop == GF_FOP_ZEROFILL || \ + fop == GF_FOP_SETATTR || fop == GF_FOP_FSETATTR || \ + fop == GF_FOP_LINK) \ + fop_flags = DATA_MODIFY_FOP; \ + \ + if (!(fd_flags & (O_NONBLOCK | O_NDELAY))) \ + fop_flags |= BLOCKING_FOP; \ + \ + } while (0) + +#define GET_FLAGS_LK(cmd, l_type, fd_flags) \ + do { \ + /* TODO: handle F_RESLK_LCK and other glusterfs_lk_recovery_cmds_t */ \ + if ((cmd == F_SETLKW || cmd == F_SETLKW64 || cmd == F_SETLK || \ + cmd == F_SETLK64) && \ + l_type == F_WRLCK) \ + fop_flags = DATA_MODIFY_FOP; \ + \ + if (fd_flags & (O_NONBLOCK | O_NDELAY) && \ + (cmd == F_SETLKW || cmd == F_SETLKW64)) \ + fop_flags |= BLOCKING_FOP; \ + \ + } while (0) + +#define LEASE_BLOCK_FOP(inode, fop_name, frame, this, params...) 
\ + do { \ + call_stub_t *__stub = NULL; \ + fop_stub_t *blk_fop = NULL; \ + lease_inode_ctx_t *lease_ctx = NULL; \ + \ + __stub = fop_##fop_name##_stub(frame, default_##fop_name##_resume, \ + params); \ + if (!__stub) { \ + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, \ + "Unable to create stub"); \ + ret = -ENOMEM; \ + goto __out; \ + } \ + \ + blk_fop = GF_CALLOC(1, sizeof(*blk_fop), gf_leases_mt_fop_stub_t); \ + if (!blk_fop) { \ + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, \ + "Unable to create lease fop stub"); \ + ret = -ENOMEM; \ + goto __out; \ + } \ + \ + lease_ctx = lease_ctx_get(inode, this); \ + if (!lease_ctx) { \ + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, \ + "Unable to create/get inode ctx"); \ + ret = -ENOMEM; \ + goto __out; \ + } \ + \ + blk_fop->stub = __stub; \ + pthread_mutex_lock(&lease_ctx->lock); \ + { \ + /*TODO: If the lease is unlocked btw check lease conflict and \ + * by now, then this fop shouldn't be add to the blocked fop \ + * list, can use generation number for the same?*/ \ + list_add_tail(&blk_fop->list, &lease_ctx->blocked_list); \ + } \ + pthread_mutex_unlock(&lease_ctx->lock); \ + \ + __out: \ + if (ret < 0) { \ + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM, \ + "Unable to create stub for blocking the fop:%s (%s)", \ + gf_fop_list[frame->root->op], strerror(ENOMEM)); \ + if (__stub != NULL) { \ + call_stub_destroy(__stub); \ + } \ + GF_FREE(blk_fop); \ + goto err; \ + } \ + } while (0) + +struct _leases_private { + struct list_head client_list; + struct list_head recall_list; + struct tvec_base *timer_wheel; /* timer wheel where the recall request + is qued and waits for unlock/expiry */ + pthread_t recall_thr; + pthread_mutex_t mutex; + pthread_cond_t cond; + int32_t recall_lease_timeout; + gf_boolean_t inited_recall_thr; + gf_boolean_t fini; + gf_boolean_t leases_enabled; + + char _pad[1]; /* manual padding */ +}; +typedef struct _leases_private 
leases_private_t; + +struct _lease_client { + char *client_uid; + struct list_head client_list; + struct list_head inode_list; +}; +typedef struct _lease_client lease_client_t; + +struct _lease_inode { + inode_t *inode; + struct list_head + list; /* This can be part of both inode_list and recall_list */ +}; +typedef struct _lease_inode lease_inode_t; + +struct _lease_fd_ctx { + char *client_uid; + char lease_id[LEASE_ID_SIZE]; +}; +typedef struct _lease_fd_ctx lease_fd_ctx_t; + +struct _lease_inode_ctx { + struct list_head lease_id_list; /* clients that have taken leases */ + int lease_type_cnt[GF_LEASE_MAX_TYPE + 1]; + uint64_t lease_cnt; /* Total number of leases on this inode */ + uint64_t openfd_cnt; /* number of fds open */ + struct list_head blocked_list; /* List of fops blocked until the + lease recall is complete */ + inode_t *inode; /* this represents the inode on which the + lock was taken, required mainly during + disconnect cleanup */ + struct gf_tw_timer_list *timer; + pthread_mutex_t lock; + int lease_type; /* Types of leases acquired */ + gf_boolean_t recall_in_progress; /* if lease recall is sent on this inode */ + gf_boolean_t blocked_fops_resuming; /* if blocked fops are being resumed */ + + char _pad[2]; /* manual padding */ +}; +typedef struct _lease_inode_ctx lease_inode_ctx_t; + +struct _lease_id_entry { + struct list_head lease_id_list; + char lease_id[LEASE_ID_SIZE]; + char *client_uid; /* uid of the client that has + taken the lease */ + int lease_type_cnt[GF_LEASE_MAX_TYPE + 1]; /* count of each lease type */ + uint64_t lease_cnt; /* Number of leases taken under the + given lease id */ + time_t recall_time; /* time @ which recall was sent */ + int lease_type; /* Union of all the leases taken + under the given lease id */ + char _pad[4]; /* manual padding */ +}; +typedef struct _lease_id_entry lease_id_entry_t; + +/* Required? 
as stub itself will have list */ +struct __fop_stub { + struct list_head list; + call_stub_t *stub; +}; +typedef struct __fop_stub fop_stub_t; + +struct __lease_timer_data { + inode_t *inode; + xlator_t *this; +}; +typedef struct __lease_timer_data lease_timer_data_t; + +gf_boolean_t +is_leases_enabled(xlator_t *this); + +lease_inode_ctx_t * +lease_ctx_get(inode_t *inode, xlator_t *this); + +int +process_lease_req(call_frame_t *frame, xlator_t *this, inode_t *inode, + struct gf_lease *lease); + +int +check_lease_conflict(call_frame_t *frame, inode_t *inode, const char *lease_id, + uint32_t fop_flags); + +int +cleanup_client_leases(xlator_t *this, const char *client_uid); + +void * +expired_recall_cleanup(void *data); + +#endif /* _LEASES_H */ diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am index 0f79731b415..0b174c19d2d 100644 --- a/xlators/features/locks/src/Makefile.am +++ b/xlators/features/locks/src/Makefile.am @@ -1,23 +1,29 @@ +if WITH_SERVER xlator_LTLIBRARIES = locks.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -locks_la_LDFLAGS = -module -avoid-version +locks_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) locks_la_SOURCES = common.c posix.c entrylk.c inodelk.c reservelk.c \ - clear.c + clear.c + locks_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h +noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h pl-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) CLEANFILES = +if WITH_SERVER uninstall-local: rm -f $(DESTDIR)$(xlatordir)/posix-locks.so install-data-hook: ln -sf locks.so $(DESTDIR)$(xlatordir)/posix-locks.so +endif diff --git a/xlators/features/locks/src/clear.c 
b/xlators/features/locks/src/clear.c index 124b9ad0feb..ab1eac68a53 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -12,413 +12,449 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> #include "locks.h" #include "common.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include "clear.h" +const char *clrlk_type_names[CLRLK_TYPE_MAX] = { + [CLRLK_INODE] = "inode", + [CLRLK_ENTRY] = "entry", + [CLRLK_POSIX] = "posix", +}; + int -clrlk_get_kind (char *kind) +clrlk_get_kind(char *kind) { - char *clrlk_kinds[CLRLK_KIND_MAX] = {"dummy", "blocked", "granted", - "all"}; - int ret_kind = CLRLK_KIND_MAX; - int i = 0; - - for (i = CLRLK_BLOCKED; i < CLRLK_KIND_MAX; i++) { - if (!strcmp (clrlk_kinds[i], kind)) { - ret_kind = i; - break; - } + char *clrlk_kinds[CLRLK_KIND_MAX] = {"dummy", "blocked", "granted", "all"}; + int ret_kind = CLRLK_KIND_MAX; + int i = 0; + + for (i = CLRLK_BLOCKED; i < CLRLK_KIND_MAX; i++) { + if (!strcmp(clrlk_kinds[i], kind)) { + ret_kind = i; + break; } + } - return ret_kind; + return ret_kind; } int -clrlk_get_type (char *type) +clrlk_get_type(char *type) { - char *clrlk_types[CLRLK_TYPE_MAX] = {"inode", "entry", "posix"}; - int ret_type = CLRLK_TYPE_MAX; - int i = 0; - - for (i = CLRLK_INODE; i < CLRLK_TYPE_MAX; i++) { - if (!strcmp (clrlk_types[i], type)) { - ret_type = i; - break; - } + char *clrlk_types[CLRLK_TYPE_MAX] = {"inode", "entry", "posix"}; + int ret_type = CLRLK_TYPE_MAX; + int i = 0; + + for (i = CLRLK_INODE; i < CLRLK_TYPE_MAX; i++) { + if (!strcmp(clrlk_types[i], type)) { + ret_type = i; + break; } + } - return ret_type; + 
return ret_type; } int -clrlk_get_lock_range (char *range_str, struct gf_flock *ulock, - gf_boolean_t *chk_range) +clrlk_get_lock_range(char *range_str, struct gf_flock *ulock, + gf_boolean_t *chk_range) { - int ret = -1; - - if (!chk_range) - goto out; + int ret = -1; - if (!range_str) { - ret = 0; - *chk_range = _gf_false; - goto out; - } - - if (sscanf (range_str, "%hd,%"PRId64"-""%"PRId64, &ulock->l_whence, - &ulock->l_start, &ulock->l_len) != 3) { - goto out; - } + if (!chk_range) + goto out; + if (!range_str) { ret = 0; - *chk_range = _gf_true; + *chk_range = _gf_false; + goto out; + } + + if (sscanf(range_str, + "%hd,%" PRId64 "-" + "%" PRId64, + &ulock->l_whence, &ulock->l_start, &ulock->l_len) != 3) { + goto out; + } + + ret = 0; + *chk_range = _gf_true; out: - return ret; + return ret; } int -clrlk_parse_args (const char* cmd, clrlk_args *args) +clrlk_parse_args(const char *cmd, clrlk_args *args) { - char *opts = NULL; - char *cur = NULL; - char *tok = NULL; - char *sptr = NULL; - char *free_ptr = NULL; - char kw[KW_MAX] = {[KW_TYPE] = 't', - [KW_KIND] = 'k', - }; - int ret = -1; - int i = 0; - - GF_ASSERT (cmd); - free_ptr = opts = GF_CALLOC (1, strlen (cmd), gf_common_mt_char); - if (!opts) - goto out; - - if (sscanf (cmd, GF_XATTR_CLRLK_CMD".%s", opts) < 1) { - ret = -1; - goto out; + char *opts = NULL; + char *cur = NULL; + char *tok = NULL; + char *sptr = NULL; + char *free_ptr = NULL; + char kw[KW_MAX] = { + [KW_TYPE] = 't', + [KW_KIND] = 'k', + }; + int ret = -1; + int i = 0; + + GF_ASSERT(cmd); + free_ptr = opts = GF_CALLOC(1, strlen(cmd), gf_common_mt_char); + if (!opts) + goto out; + + if (sscanf(cmd, GF_XATTR_CLRLK_CMD ".%s", opts) < 1) { + ret = -1; + goto out; + } + + /*clr_lk_prefix.ttype.kkind.args, args - type specific*/ + cur = opts; + for (i = 0; i < KW_MAX && (tok = strtok_r(cur, ".", &sptr)); + cur = NULL, i++) { + if (tok[0] != kw[i]) { + ret = -1; + goto out; } - - /*clr_lk_prefix.ttype.kkind.args, args - type specific*/ - cur = 
opts; - for (i = 0; i < KW_MAX && (tok = strtok_r (cur, ".", &sptr)); - cur = NULL, i++) { - if (tok[0] != kw[i]) { - ret = -1; - goto out; - } - if (i == KW_TYPE) - args->type = clrlk_get_type (tok+1); - if (i == KW_KIND) - args->kind = clrlk_get_kind (tok+1); - } - - if ((args->type == CLRLK_TYPE_MAX) || (args->kind == CLRLK_KIND_MAX)) - goto out; - - /*optional args, neither range nor basename can 'legally' contain - * "/" in them*/ - tok = strtok_r (NULL, "/", &sptr); - if (tok) - args->opts = gf_strdup (tok); - - ret = 0; + if (i == KW_TYPE) + args->type = clrlk_get_type(tok + 1); + if (i == KW_KIND) + args->kind = clrlk_get_kind(tok + 1); + } + + if ((args->type == CLRLK_TYPE_MAX) || (args->kind == CLRLK_KIND_MAX)) + goto out; + + /*optional args, neither range nor basename can 'legally' contain + * "/" in them*/ + tok = strtok_r(NULL, "/", &sptr); + if (tok) + args->opts = gf_strdup(tok); + + ret = 0; out: - GF_FREE (free_ptr); - return ret; + GF_FREE(free_ptr); + return ret; } int -clrlk_clear_posixlk (xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, - int *blkd, int *granted, int *op_errno) +clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, + int *blkd, int *granted, int *op_errno) { - posix_lock_t *plock = NULL; - posix_lock_t *tmp = NULL; - struct gf_flock ulock = {0, }; - int ret = -1; - int bcount = 0; - int gcount = 0; - gf_boolean_t chk_range = _gf_false; - - if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) { - *op_errno = EINVAL; - goto out; - } - - pthread_mutex_lock (&pl_inode->mutex); + posix_lock_t *plock = NULL; + posix_lock_t *tmp = NULL; + struct gf_flock ulock = { + 0, + }; + int ret = -1; + int bcount = 0; + int gcount = 0; + gf_boolean_t chk_range = _gf_false; + + if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) { + *op_errno = EINVAL; + goto out; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(plock, tmp, &pl_inode->ext_list, list) { - 
list_for_each_entry_safe (plock, tmp, &pl_inode->ext_list, - list) { - if ((plock->blocked && - !(args->kind & CLRLK_BLOCKED)) || - (!plock->blocked && - !(args->kind & CLRLK_GRANTED))) - continue; - - if (chk_range && - (plock->user_flock.l_whence != ulock.l_whence - || plock->user_flock.l_start != ulock.l_start - || plock->user_flock.l_len != ulock.l_len)) - continue; - - list_del_init (&plock->list); - if (plock->blocked) { - bcount++; - pl_trace_out (this, plock->frame, NULL, NULL, - F_SETLKW, &plock->user_flock, - -1, EAGAIN, NULL); - - STACK_UNWIND_STRICT (lk, plock->frame, -1, EAGAIN, - &plock->user_flock, NULL); - - } else { - gcount++; - } - GF_FREE (plock); - } + if ((plock->blocked && !(args->kind & CLRLK_BLOCKED)) || + (!plock->blocked && !(args->kind & CLRLK_GRANTED))) + continue; + + if (chk_range && (plock->user_flock.l_whence != ulock.l_whence || + plock->user_flock.l_start != ulock.l_start || + plock->user_flock.l_len != ulock.l_len)) + continue; + + list_del_init(&plock->list); + if (plock->blocked) { + bcount++; + pl_trace_out(this, plock->frame, NULL, NULL, F_SETLKW, + &plock->user_flock, -1, EINTR, NULL); + + STACK_UNWIND_STRICT(lk, plock->frame, -1, EINTR, + &plock->user_flock, NULL); + + } else { + gcount++; + } + __destroy_lock(plock); } - pthread_mutex_unlock (&pl_inode->mutex); - grant_blocked_locks (this, pl_inode); - ret = 0; + } + pthread_mutex_unlock(&pl_inode->mutex); + grant_blocked_locks(this, pl_inode); + ret = 0; out: - *blkd = bcount; - *granted = gcount; - return ret; + *blkd = bcount; + *granted = gcount; + return ret; } /* Returns 0 on success and -1 on failure */ int -clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno) +clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno) { - pl_inode_lock_t *ilock = NULL; - pl_inode_lock_t *tmp = NULL; - struct gf_flock 
ulock = {0, }; - int ret = -1; - int bcount = 0; - int gcount = 0; - gf_boolean_t chk_range = _gf_false; - struct list_head released; - - INIT_LIST_HEAD (&released); - if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) { - *op_errno = EINVAL; - goto out; - } - - if (args->kind & CLRLK_BLOCKED) - goto blkd; - - if (args->kind & CLRLK_GRANTED) - goto granted; + posix_locks_private_t *priv; + pl_inode_lock_t *ilock = NULL; + pl_inode_lock_t *tmp = NULL; + struct gf_flock ulock = { + 0, + }; + int ret = -1; + int bcount = 0; + int gcount = 0; + gf_boolean_t chk_range = _gf_false; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head contend; + struct timespec now = {}; + + INIT_LIST_HEAD(&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) { + *op_errno = EINVAL; + goto out; + } + + if (args->kind & CLRLK_BLOCKED) + goto blkd; + + if (args->kind & CLRLK_GRANTED) + goto granted; blkd: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(ilock, tmp, &dom->blocked_inodelks, + blocked_locks) { - list_for_each_entry_safe (ilock, tmp, &dom->blocked_inodelks, - blocked_locks) { - if (chk_range && - (ilock->user_flock.l_whence != ulock.l_whence - || ilock->user_flock.l_start != ulock.l_start - || ilock->user_flock.l_len != ulock.l_len)) - continue; - - bcount++; - list_del_init (&ilock->blocked_locks); - list_add (&ilock->blocked_locks, &released); - } - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (ilock, tmp, &released, blocked_locks) { - list_del_init (&ilock->blocked_locks); - pl_trace_out (this, ilock->frame, NULL, NULL, F_SETLKW, - &ilock->user_flock, -1, EAGAIN, - ilock->volume); - STACK_UNWIND_STRICT (inodelk, ilock->frame, -1, - EAGAIN, NULL); - //No need to take lock as the locks are only 
in one list - __pl_inodelk_unref (ilock); + if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence || + ilock->user_flock.l_start != ulock.l_start || + ilock->user_flock.l_len != ulock.l_len)) + continue; + + bcount++; + list_del_init(&ilock->client_list); + list_del_init(&ilock->blocked_locks); + list_add(&ilock->blocked_locks, &released); } + } + pthread_mutex_unlock(&pl_inode->mutex); - if (!(args->kind & CLRLK_GRANTED)) { - ret = 0; - goto out; + if (!list_empty(&released)) { + list_for_each_entry_safe(ilock, tmp, &released, blocked_locks) + { + list_del_init(&ilock->blocked_locks); + pl_trace_out(this, ilock->frame, NULL, NULL, F_SETLKW, + &ilock->user_flock, -1, EAGAIN, ilock->volume); + STACK_UNWIND_STRICT(inodelk, ilock->frame, -1, EAGAIN, NULL); + // No need to take lock as the locks are only in one list + __pl_inodelk_unref(ilock); } + } + + if (!(args->kind & CLRLK_GRANTED)) { + ret = 0; + goto out; + } granted: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(ilock, tmp, &dom->inodelk_list, list) { - list_for_each_entry_safe (ilock, tmp, &dom->inodelk_list, - list) { - if (chk_range && - (ilock->user_flock.l_whence != ulock.l_whence - || ilock->user_flock.l_start != ulock.l_start - || ilock->user_flock.l_len != ulock.l_len)) - continue; - - gcount++; - list_del_init (&ilock->list); - list_add (&ilock->list, &released); - } + if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence || + ilock->user_flock.l_start != ulock.l_start || + ilock->user_flock.l_len != ulock.l_len)) + continue; + + gcount++; + list_del_init(&ilock->client_list); + list_del_init(&ilock->list); + list_add(&ilock->list, &released); } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - list_for_each_entry_safe (ilock, tmp, &released, list) { - list_del_init (&ilock->list); - //No need to take lock as the locks are only in one list - __pl_inodelk_unref (ilock); - } + 
list_for_each_entry_safe(ilock, tmp, &released, list) + { + list_del_init(&ilock->list); + // No need to take lock as the locks are only in one list + __pl_inodelk_unref(ilock); + } - ret = 0; + ret = 0; out: - grant_blocked_inode_locks (this, pl_inode, dom); - *blkd = bcount; - *granted = gcount; - return ret; + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + *blkd = bcount; + *granted = gcount; + return ret; } /* Returns 0 on success and -1 on failure */ int -clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno) +clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno) { - pl_entry_lock_t *elock = NULL; - pl_entry_lock_t *tmp = NULL; - int bcount = 0; - int gcount = 0; - int ret = -1; - struct list_head removed; - struct list_head released; - - INIT_LIST_HEAD (&released); - if (args->kind & CLRLK_BLOCKED) - goto blkd; - - if (args->kind & CLRLK_GRANTED) - goto granted; + posix_locks_private_t *priv; + pl_entry_lock_t *elock = NULL; + pl_entry_lock_t *tmp = NULL; + int bcount = 0; + int gcount = 0; + int ret = -1; + struct list_head *pcontend = NULL; + struct list_head removed; + struct list_head released; + struct list_head contend; + struct timespec now; + + INIT_LIST_HEAD(&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (args->kind & CLRLK_BLOCKED) + goto blkd; + + if (args->kind & CLRLK_GRANTED) + goto granted; blkd: - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(elock, tmp, &dom->blocked_entrylks, + blocked_locks) { - list_for_each_entry_safe (elock, tmp, &dom->blocked_entrylks, - blocked_locks) { - if (args->opts) { - if 
(!elock->basename || - strcmp (elock->basename, args->opts)) - continue; - } - - bcount++; - - list_del_init (&elock->blocked_locks); - list_add_tail (&elock->blocked_locks, &released); - } - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (elock, tmp, &released, blocked_locks) { - list_del_init (&elock->blocked_locks); - entrylk_trace_out (this, elock->frame, elock->volume, NULL, NULL, - elock->basename, ENTRYLK_LOCK, elock->type, - -1, EAGAIN); - STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL); - GF_FREE ((char *) elock->basename); - GF_FREE (elock->connection_id); - GF_FREE (elock); + if (args->opts) { + if (!elock->basename || strcmp(elock->basename, args->opts)) + continue; + } + + bcount++; + + list_del_init(&elock->client_list); + list_del_init(&elock->blocked_locks); + list_add_tail(&elock->blocked_locks, &released); } + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (!list_empty(&released)) { + list_for_each_entry_safe(elock, tmp, &released, blocked_locks) + { + list_del_init(&elock->blocked_locks); + entrylk_trace_out(this, elock->frame, elock->volume, NULL, NULL, + elock->basename, ENTRYLK_LOCK, elock->type, -1, + EAGAIN); + STACK_UNWIND_STRICT(entrylk, elock->frame, -1, EAGAIN, NULL); - if (!(args->kind & CLRLK_GRANTED)) { - ret = 0; - goto out; + __pl_entrylk_unref(elock); } + } + + if (!(args->kind & CLRLK_GRANTED)) { + ret = 0; + goto out; + } granted: - INIT_LIST_HEAD (&removed); - pthread_mutex_lock (&pl_inode->mutex); + INIT_LIST_HEAD(&removed); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(elock, tmp, &dom->entrylk_list, domain_list) { - list_for_each_entry_safe (elock, tmp, &dom->entrylk_list, - domain_list) { - if (args->opts) { - if (!elock->basename || - strcmp (elock->basename, args->opts)) - continue; - } - - gcount++; - list_del_init (&elock->domain_list); - list_add_tail (&elock->domain_list, &removed); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + if (args->opts) 
{ + if (!elock->basename || strcmp(elock->basename, args->opts)) + continue; + } - list_for_each_entry_safe (elock, tmp, &removed, domain_list) { - grant_blocked_entry_locks (this, pl_inode, elock, dom); + gcount++; + list_del_init(&elock->client_list); + list_del_init(&elock->domain_list); + list_add_tail(&elock->domain_list, &removed); + + __pl_entrylk_unref(elock); } + } + pthread_mutex_unlock(&pl_inode->mutex); - ret = 0; + grant_blocked_entry_locks(this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } + + ret = 0; out: - *blkd = bcount; - *granted = gcount; - return ret; + *blkd = bcount; + *granted = gcount; + return ret; } int -clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno) +clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode, + clrlk_args *args, int *blkd, int *granted, + int *op_errno) { - pl_dom_list_t *dom = NULL; - int ret = -1; - int tmp_bcount = 0; - int tmp_gcount = 0; - - if (list_empty (&pl_inode->dom_list)) { - ret = 0; - goto out; - } + pl_dom_list_t *dom = NULL; + int ret = -1; + int tmp_bcount = 0; + int tmp_gcount = 0; - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - tmp_bcount = tmp_gcount = 0; - - switch (args->type) - { - case CLRLK_INODE: - ret = clrlk_clear_inodelk (this, pl_inode, dom, args, - &tmp_bcount, &tmp_gcount, - op_errno); - if (ret) - goto out; - break; - case CLRLK_ENTRY: - ret = clrlk_clear_entrylk (this, pl_inode, dom, args, - &tmp_bcount, &tmp_gcount, - op_errno); - if (ret) - goto out; - break; - } - - *blkd += tmp_bcount; - *granted += tmp_gcount; + if (list_empty(&pl_inode->dom_list)) { + ret = 0; + goto out; + } + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + tmp_bcount = tmp_gcount = 0; + + switch (args->type) { + case CLRLK_INODE: + ret = clrlk_clear_inodelk(this, pl_inode, dom, args, + &tmp_bcount, &tmp_gcount, op_errno); + 
if (ret) + goto out; + break; + case CLRLK_ENTRY: + ret = clrlk_clear_entrylk(this, pl_inode, dom, args, + &tmp_bcount, &tmp_gcount, op_errno); + if (ret) + goto out; + break; } - ret = 0; + *blkd += tmp_bcount; + *granted += tmp_gcount; + } + + ret = 0; out: - return ret; + return ret; } diff --git a/xlators/features/locks/src/clear.h b/xlators/features/locks/src/clear.h index 511f3f74ae5..bc118cb1b81 100644 --- a/xlators/features/locks/src/clear.h +++ b/xlators/features/locks/src/clear.h @@ -10,67 +10,64 @@ #ifndef __CLEAR_H__ #define __CLEAR_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "compat-errno.h" -#include "stack.h" -#include "call-stub.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/stack.h> +#include <glusterfs/call-stub.h> #include "locks.h" typedef enum { - CLRLK_INODE, - CLRLK_ENTRY, - CLRLK_POSIX, - CLRLK_TYPE_MAX + CLRLK_INODE, + CLRLK_ENTRY, + CLRLK_POSIX, + CLRLK_TYPE_MAX } clrlk_type; +extern const char *clrlk_type_names[]; + typedef enum { - CLRLK_BLOCKED = 1, - CLRLK_GRANTED, - CLRLK_ALL, - CLRLK_KIND_MAX + CLRLK_BLOCKED = 1, + CLRLK_GRANTED, + CLRLK_ALL, + CLRLK_KIND_MAX } clrlk_kind; typedef enum { - KW_TYPE, - KW_KIND, - /*add new keywords here*/ - KW_MAX + KW_TYPE, + KW_KIND, + /*add new keywords here*/ + KW_MAX } clrlk_opts; struct _clrlk_args; typedef struct _clrlk_args clrlk_args; struct _clrlk_args { - int type; - int kind; - char *opts; + int type; + int kind; + char *opts; }; int -clrlk_get__kind (char *kind); +clrlk_get__kind(char *kind); int -clrlk_get_type (char *type); +clrlk_get_type(char *type); int -clrlk_get_lock_range (char *range_str, struct gf_flock *ulock, - gf_boolean_t *chk_range); +clrlk_get_lock_range(char *range_str, struct gf_flock *ulock, + gf_boolean_t *chk_range); int -clrlk_parse_args (const char* cmd, clrlk_args *args); +clrlk_parse_args(const char *cmd, clrlk_args *args); int -clrlk_clear_posixlk (xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, - int 
*blkd, int *granted, int *op_errno); +clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, + int *blkd, int *granted, int *op_errno); int -clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno); +clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno); int -clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, - clrlk_args *args, int *blkd, int *granted, int *op_errno); +clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, + clrlk_args *args, int *blkd, int *granted, int *op_errno); int -clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno); +clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode, + clrlk_args *args, int *blkd, int *granted, + int *op_errno); #endif /* __CLEAR_H__ */ diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index b3309580d3d..a2c6be93e03 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -12,737 +12,775 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/logging.h> +#include <glusterfs/syncop.h> #include "locks.h" #include "common.h" - static int -__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock); +__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock); static void -__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock); +__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock); static int -pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *old_lock); +pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode, + posix_lock_t *old_lock); static pl_dom_list_t * -__allocate_domain (const char *volume) +__allocate_domain(const char *volume) { - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; - dom = GF_CALLOC (1, sizeof (*dom), - gf_locks_mt_pl_dom_list_t); - if (!dom) - goto out; + dom = GF_CALLOC(1, sizeof(*dom), gf_locks_mt_pl_dom_list_t); + if (!dom) + goto out; - dom->domain = gf_strdup(volume); - if (!dom->domain) - goto out; + dom->domain = gf_strdup(volume); + if (!dom->domain) + goto out; - gf_log ("posix-locks", GF_LOG_TRACE, - "New domain allocated: %s", dom->domain); + gf_log("posix-locks", GF_LOG_TRACE, "New domain allocated: %s", + dom->domain); - INIT_LIST_HEAD (&dom->inode_list); - INIT_LIST_HEAD (&dom->entrylk_list); - INIT_LIST_HEAD (&dom->blocked_entrylks); - INIT_LIST_HEAD (&dom->inodelk_list); - INIT_LIST_HEAD (&dom->blocked_inodelks); + INIT_LIST_HEAD(&dom->inode_list); + INIT_LIST_HEAD(&dom->entrylk_list); + INIT_LIST_HEAD(&dom->blocked_entrylks); + INIT_LIST_HEAD(&dom->inodelk_list); + 
INIT_LIST_HEAD(&dom->blocked_inodelks); out: - if (dom && (NULL == dom->domain)) { - GF_FREE (dom); - dom = NULL; - } + if (dom && (NULL == dom->domain)) { + GF_FREE(dom); + dom = NULL; + } - return dom; + return dom; } /* Returns domain for the lock. If domain is not present, * allocates a domain and returns it */ pl_dom_list_t * -get_domain (pl_inode_t *pl_inode, const char *volume) +get_domain(pl_inode_t *pl_inode, const char *volume) { - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; - GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out); - GF_VALIDATE_OR_GOTO ("posix-locks", volume, out); + GF_VALIDATE_OR_GOTO("posix-locks", pl_inode, out); + GF_VALIDATE_OR_GOTO("posix-locks", volume, out); - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (strcmp (dom->domain, volume) == 0) - goto unlock; - } - - dom = __allocate_domain (volume); - if (dom) - list_add (&dom->inode_list, &pl_inode->dom_list); + if (strcmp(dom->domain, volume) == 0) + goto unlock; } + + dom = __allocate_domain(volume); + if (dom) + list_add(&dom->inode_list, &pl_inode->dom_list); + } unlock: - pthread_mutex_unlock (&pl_inode->mutex); - if (dom) { - gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); - } else { - gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); - } + pthread_mutex_unlock(&pl_inode->mutex); + if (dom) { + gf_log("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); + } else { + gf_log("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); + } out: - return dom; + return dom; } unsigned long -fd_to_fdnum (fd_t *fd) +fd_to_fdnum(fd_t *fd) { - return ((unsigned long) fd); + return ((unsigned long)fd); } fd_t * -fd_from_fdnum (posix_lock_t *lock) +fd_from_fdnum(posix_lock_t *lock) { - return ((fd_t *) lock->fd_num); + return ((fd_t *)lock->fd_num); } int 
-__pl_inode_is_empty (pl_inode_t *pl_inode) +__pl_inode_is_empty(pl_inode_t *pl_inode) { - pl_dom_list_t *dom = NULL; - int is_empty = 1; - - if (!list_empty (&pl_inode->ext_list)) - is_empty = 0; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (!list_empty (&dom->entrylk_list)) - is_empty = 0; - - if (!list_empty (&dom->inodelk_list)) - is_empty = 0; - } - - return is_empty; + return (list_empty(&pl_inode->ext_list)); } void -pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame) +pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame) { - snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", - (unsigned long long) frame->root->pid, - lkowner_utoa (&frame->root->lk_owner), - frame->root->client, - (unsigned long long) frame->root->unique); + snprintf(str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", + (unsigned long long)frame->root->pid, + lkowner_utoa(&frame->root->lk_owner), frame->root->client, + (unsigned long long)frame->root->unique); } - void -pl_print_lockee (char *str, int size, fd_t *fd, loc_t *loc) +pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc) { - inode_t *inode = NULL; - char *ipath = NULL; - int ret = 0; + inode_t *inode = NULL; + char *ipath = NULL; + int ret = 0; - if (fd) - inode = fd->inode; - if (loc) - inode = loc->inode; + if (fd) + inode = fd->inode; + if (loc) + inode = loc->inode; - if (!inode) { - snprintf (str, size, "<nul>"); - return; - } + if (!inode) { + snprintf(str, size, "<nul>"); + return; + } - if (loc && loc->path) { - ipath = gf_strdup (loc->path); - } else { - ret = inode_path (inode, NULL, &ipath); - if (ret <= 0) - ipath = NULL; - } + if (loc && loc->path) { + ipath = gf_strdup(loc->path); + } else { + ret = inode_path(inode, NULL, &ipath); + if (ret <= 0) + ipath = NULL; + } - snprintf (str, size, "gfid=%s, fd=%p, path=%s", - uuid_utoa (inode->gfid), fd, - ipath ? 
ipath : "<nul>"); + snprintf(str, size, "gfid=%s, fd=%p, path=%s", uuid_utoa(inode->gfid), fd, + ipath ? ipath : "<nul>"); - GF_FREE (ipath); + GF_FREE(ipath); } - void -pl_print_lock (char *str, int size, int cmd, - struct gf_flock *flock, gf_lkowner_t *owner) +pl_print_lock(char *str, int size, int cmd, struct gf_flock *flock, + gf_lkowner_t *owner) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - cmd_str = "GETLK"; - break; + cmd_str = "GETLK"; + break; #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - cmd_str = "SETLK"; - break; + cmd_str = "SETLK"; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - cmd_str = "SETLKW"; - break; + cmd_str = "SETLKW"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (flock->l_type) { + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=FCNTL, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%s", - cmd_str, type_str, (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner)); + type_str = "UNKNOWN"; + break; + } + + snprintf(str, size, + "lock=FCNTL, cmd=%s, type=%s, " + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", + cmd_str, type_str, (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, (unsigned long long)flock->l_pid, + lkowner_utoa(owner)); } - void -pl_trace_in (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain) +pl_trace_in(xlator_t *this, 
call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; - - priv = this->private; + posix_locks_private_t *priv = this->private; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - gf_log (this->name, GF_LOG_INFO, - "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_lock); } - void -pl_print_verdict (char *str, int size, int op_ret, int op_errno) +pl_print_verdict(char *str, int size, int op_ret, int op_errno) { - char *verdict = NULL; - - if (op_ret == 0) { - verdict = "GRANTED"; - } else { - switch (op_errno) { - case EAGAIN: - verdict = "TRYAGAIN"; - break; - default: - verdict = strerror (op_errno); - } + char *verdict = NULL; + + if (op_ret == 0) { + verdict = "GRANTED"; + } else { + switch (op_errno) { + case EAGAIN: + verdict = "TRYAGAIN"; + break; + default: + verdict = strerror(op_errno); } + } - snprintf (str, size, "%s", verdict); + snprintf(str, size, "%s", verdict); } - void -pl_trace_out (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, int op_ret, int op_errno, const char *domain) +pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct 
gf_flock *flock, int op_ret, int op_errno, + const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; - char verdict[32]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; + char verdict[32]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - pl_print_verdict (verdict, 32, op_ret, op_errno); + pl_print_verdict(verdict, 32, op_ret, op_errno); - gf_log (this->name, GF_LOG_INFO, - "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", - verdict, pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker, + pl_lockee, pl_lock); } - void -pl_trace_block (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain) +pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, + int cmd, struct gf_flock *flock, const char *domain) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_lock[256]; + posix_locks_private_t *priv = this->private; + char pl_locker[256]; + char pl_lockee[256]; + char pl_lock[256]; - priv = this->private; - - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - if (domain) - pl_print_inodelk (pl_lock, 256, cmd, flock, domain); - 
else - pl_print_lock (pl_lock, 256, cmd, flock, &frame->root->lk_owner); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + if (domain) + pl_print_inodelk(pl_lock, 256, cmd, flock, domain); + else + pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner); - gf_log (this->name, GF_LOG_INFO, - "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_lock); + gf_log(this->name, GF_LOG_INFO, + "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_lock); } - void -pl_trace_flush (xlator_t *this, call_frame_t *frame, fd_t *fd) +pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - pl_inode_t *pl_inode = NULL; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + pl_inode_t *pl_inode = NULL; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_inode = pl_inode_get (this, fd->inode); + pl_inode = pl_inode_get(this, fd->inode, NULL); - if (pl_inode && __pl_inode_is_empty (pl_inode)) - return; + if (pl_inode && __pl_inode_is_empty(pl_inode)) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, NULL); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, NULL); - gf_log (this->name, GF_LOG_INFO, - "[FLUSH] Locker = {%s} Lockee = {%s}", - pl_locker, pl_lockee); + gf_log(this->name, GF_LOG_INFO, "[FLUSH] Locker = {%s} Lockee = {%s}", + pl_locker, pl_lockee); } void -pl_trace_release (xlator_t *this, fd_t *fd) +pl_trace_release(xlator_t *this, fd_t *fd) { - posix_locks_private_t *priv = NULL; - char pl_lockee[256]; + posix_locks_private_t *priv = NULL; + char pl_lockee[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_lockee (pl_lockee, 256, fd, 
NULL); + pl_print_lockee(pl_lockee, 256, fd, NULL); - gf_log (this->name, GF_LOG_INFO, - "[RELEASE] Lockee = {%s}", pl_lockee); + gf_log(this->name, GF_LOG_INFO, "[RELEASE] Lockee = {%s}", pl_lockee); } - void -pl_update_refkeeper (xlator_t *this, inode_t *inode) +pl_update_refkeeper(xlator_t *this, inode_t *inode) { - pl_inode_t *pl_inode = NULL; - int is_empty = 0; - int need_unref = 0; - int need_ref = 0; + pl_inode_t *pl_inode = NULL; + int is_empty = 0; + int need_unref = 0; + int need_ref = 0; - pl_inode = pl_inode_get (this, inode); + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) + return; - pthread_mutex_lock (&pl_inode->mutex); - { - is_empty = __pl_inode_is_empty (pl_inode); + pthread_mutex_lock(&pl_inode->mutex); + { + is_empty = __pl_inode_is_empty(pl_inode); - if (is_empty && pl_inode->refkeeper) { - need_unref = 1; - pl_inode->refkeeper = NULL; - } + if (is_empty && pl_inode->refkeeper) { + need_unref = 1; + pl_inode->refkeeper = NULL; + } - if (!is_empty && !pl_inode->refkeeper) { - need_ref = 1; - pl_inode->refkeeper = inode; - } + if (!is_empty && !pl_inode->refkeeper) { + need_ref = 1; + pl_inode->refkeeper = inode; } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - if (need_unref) - inode_unref (inode); + if (need_unref) + inode_unref(inode); - if (need_ref) - inode_ref (inode); + if (need_ref) + inode_ref(inode); } - -pl_inode_t * -pl_inode_get (xlator_t *this, inode_t *inode) +/* Get lock enforcement info from disk */ +int +pl_fetch_mlock_info_from_disk(xlator_t *this, pl_inode_t *pl_inode, + pl_local_t *local) { - uint64_t tmp_pl_inode = 0; - pl_inode_t *pl_inode = NULL; - int ret = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret == 0) { - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - goto unlock; - } - pl_inode = GF_CALLOC (1, sizeof (*pl_inode), - gf_locks_mt_pl_inode_t); - if (!pl_inode) { - goto unlock; - } + dict_t *xdata_rsp = NULL; 
+ int ret = 0; + int op_ret = 0; + + if (!local) { + return -1; + } + + if (local->fd) { + op_ret = syncop_fgetxattr(this, local->fd, &xdata_rsp, + GF_ENFORCE_MANDATORY_LOCK, NULL, NULL); + } else { + op_ret = syncop_getxattr(this, &local->loc[0], &xdata_rsp, + GF_ENFORCE_MANDATORY_LOCK, NULL, NULL); + } + + pthread_mutex_lock(&pl_inode->mutex); + { + if (op_ret >= 0) { + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; + } else { + gf_msg(this->name, GF_LOG_WARNING, -op_ret, 0, + "getxattr failed with %d", op_ret); + pl_inode->mlock_enforced = _gf_false; + + if (-op_ret == ENODATA) { + pl_inode->check_mlock_info = _gf_false; + } else { + pl_inode->check_mlock_info = _gf_true; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); - gf_log (this->name, GF_LOG_TRACE, - "Allocating new pl inode"); + return ret; +} - pthread_mutex_init (&pl_inode->mutex, NULL); +pl_inode_t * +pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) +{ + uint64_t tmp_pl_inode = 0; + pl_inode_t *pl_inode = NULL; + int ret = 0; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret == 0) { + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + goto unlock; + } - INIT_LIST_HEAD (&pl_inode->dom_list); - INIT_LIST_HEAD (&pl_inode->ext_list); - INIT_LIST_HEAD (&pl_inode->rw_list); - INIT_LIST_HEAD (&pl_inode->reservelk_list); - INIT_LIST_HEAD (&pl_inode->blocked_reservelks); - INIT_LIST_HEAD (&pl_inode->blocked_calls); + pl_inode = GF_CALLOC(1, sizeof(*pl_inode), gf_locks_mt_pl_inode_t); + if (!pl_inode) { + goto unlock; + } - __inode_ctx_put (inode, this, (uint64_t)(long)(pl_inode)); + gf_log(this->name, GF_LOG_TRACE, "Allocating new pl inode"); + + pthread_mutex_init(&pl_inode->mutex, NULL); + pthread_cond_init(&pl_inode->check_fop_wind_count, 0); + + INIT_LIST_HEAD(&pl_inode->dom_list); + INIT_LIST_HEAD(&pl_inode->ext_list); + INIT_LIST_HEAD(&pl_inode->rw_list); + INIT_LIST_HEAD(&pl_inode->reservelk_list); + 
INIT_LIST_HEAD(&pl_inode->blocked_reservelks); + INIT_LIST_HEAD(&pl_inode->blocked_calls); + INIT_LIST_HEAD(&pl_inode->metalk_list); + INIT_LIST_HEAD(&pl_inode->queued_locks); + INIT_LIST_HEAD(&pl_inode->waiting); + gf_uuid_copy(pl_inode->gfid, inode->gfid); + + pl_inode->check_mlock_info = _gf_true; + pl_inode->mlock_enforced = _gf_false; + + /* -2 means never looked up. -1 means something went wrong and link + * tracking is disabled. */ + pl_inode->links = -2; + + ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); + if (ret) { + pthread_mutex_destroy(&pl_inode->mutex); + GF_FREE(pl_inode); + pl_inode = NULL; + goto unlock; } + } unlock: - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); - return pl_inode; -} + if ((pl_inode != NULL) && pl_is_mandatory_locking_enabled(pl_inode) && + pl_inode->check_mlock_info && local) { + /* Note: The lock enforcement information per file can be stored in the + attribute flag of stat(x) in posix. With that there won't be a need + for doing getxattr post a reboot + */ + pl_fetch_mlock_info_from_disk(this, pl_inode, local); + } + return pl_inode; +} /* Create a new posix_lock_t */ posix_lock_t * -new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - gf_lkowner_t *owner, fd_t *fd) +new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking, + int32_t *op_errno) { - posix_lock_t *lock = NULL; + posix_lock_t *lock = NULL; - GF_VALIDATE_OR_GOTO ("posix-locks", flock, out); - GF_VALIDATE_OR_GOTO ("posix-locks", client, out); - GF_VALIDATE_OR_GOTO ("posix-locks", fd, out); + GF_VALIDATE_OR_GOTO("posix-locks", flock, out); + GF_VALIDATE_OR_GOTO("posix-locks", client, out); + GF_VALIDATE_OR_GOTO("posix-locks", fd, out); - lock = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!lock) { - goto out; - } + if (!pl_is_lk_owner_valid(owner, client)) { + *op_errno = EINVAL; + goto out; + } + + lock = 
GF_CALLOC(1, sizeof(posix_lock_t), gf_locks_mt_posix_lock_t); + if (!lock) { + *op_errno = ENOMEM; + goto out; + } + + lock->fl_start = flock->l_start; + lock->fl_type = flock->l_type; - lock->fl_start = flock->l_start; - lock->fl_type = flock->l_type; + if (flock->l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = flock->l_start + flock->l_len - 1; - if (flock->l_len == 0) - lock->fl_end = LLONG_MAX; - else - lock->fl_end = flock->l_start + flock->l_len - 1; + lock->client = client; - lock->client = client; - lock->fd_num = fd_to_fdnum (fd); - lock->fd = fd; - lock->client_pid = client_pid; - lock->owner = *owner; + lock->client_uid = gf_strdup(client->client_uid); + if (lock->client_uid == NULL) { + GF_FREE(lock); + lock = NULL; + *op_errno = ENOMEM; + goto out; + } - INIT_LIST_HEAD (&lock->list); + lock->fd_num = fd_to_fdnum(fd); + lock->fd = fd; + lock->client_pid = client_pid; + lock->owner = *owner; + lock->lk_flags = lk_flags; + + lock->blocking = blocking; + memcpy(&lock->user_flock, flock, sizeof(lock->user_flock)); + + INIT_LIST_HEAD(&lock->list); out: - return lock; + return lock; } - /* Delete a lock from the inode's lock list */ void -__delete_lock (pl_inode_t *pl_inode, posix_lock_t *lock) +__delete_lock(posix_lock_t *lock) { - list_del_init (&lock->list); + list_del_init(&lock->list); } - /* Destroy a posix_lock */ void -__destroy_lock (posix_lock_t *lock) +__destroy_lock(posix_lock_t *lock) { - GF_FREE (lock); + GF_FREE(lock->client_uid); + GF_FREE(lock); } +static posix_lock_t * +__copy_lock(posix_lock_t *src) +{ + posix_lock_t *dst; + + dst = GF_MALLOC(sizeof(posix_lock_t), gf_locks_mt_posix_lock_t); + if (dst != NULL) { + memcpy(dst, src, sizeof(posix_lock_t)); + dst->client_uid = gf_strdup(src->client_uid); + if (dst->client_uid == NULL) { + GF_FREE(dst); + dst = NULL; + } + + if (dst != NULL) + INIT_LIST_HEAD(&dst->list); + } + + return dst; +} /* Convert a posix_lock to a struct gf_flock */ void -posix_lock_to_flock (posix_lock_t 
*lock, struct gf_flock *flock) +posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock) { - flock->l_pid = lock->client_pid; - flock->l_type = lock->fl_type; - flock->l_start = lock->fl_start; - flock->l_owner = lock->owner; - - if (lock->fl_end == LLONG_MAX) - flock->l_len = 0; - else - flock->l_len = lock->fl_end - lock->fl_start + 1; + flock->l_pid = lock->user_flock.l_pid; + flock->l_type = lock->fl_type; + flock->l_start = lock->fl_start; + flock->l_owner = lock->owner; + + if (lock->fl_end == LLONG_MAX) + flock->l_len = 0; + else + flock->l_len = lock->fl_end - lock->fl_start + 1; } /* Insert the lock into the inode's lock list */ static void -__insert_lock (pl_inode_t *pl_inode, posix_lock_t *lock) +__insert_lock(pl_inode_t *pl_inode, posix_lock_t *lock) { - if (lock->blocked) - gettimeofday (&lock->blkd_time, NULL); - else - gettimeofday (&lock->granted_time, NULL); + if (lock->blocked) + lock->blkd_time = gf_time(); + else + lock->granted_time = gf_time(); - list_add_tail (&lock->list, &pl_inode->ext_list); - - return; + list_add_tail(&lock->list, &pl_inode->ext_list); } - /* Return true if the locks overlap, false otherwise */ int -locks_overlap (posix_lock_t *l1, posix_lock_t *l2) +locks_overlap(posix_lock_t *l1, posix_lock_t *l2) { - /* - Note: - FUSE always gives us absolute offsets, so no need to worry - about SEEK_CUR or SEEK_END - */ + /* + Note: + FUSE always gives us absolute offsets, so no need to worry + about SEEK_CUR or SEEK_END + */ - return ((l1->fl_end >= l2->fl_start) && - (l2->fl_end >= l1->fl_start)); + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } - /* Return true if the locks have the same owner */ int -same_owner (posix_lock_t *l1, posix_lock_t *l2) +same_owner(posix_lock_t *l1, posix_lock_t *l2) { - - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->client == l2->client)); - + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); } - /* Delete all F_UNLCK locks 
*/ void -__delete_unlck_locks (pl_inode_t *pl_inode) +__delete_unlck_locks(pl_inode_t *pl_inode) { - posix_lock_t *l = NULL; - posix_lock_t *tmp = NULL; - - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->fl_type == F_UNLCK) { - __delete_lock (pl_inode, l); - __destroy_lock (l); - } + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->fl_type == F_UNLCK) { + __delete_lock(l); + __destroy_lock(l); } + } } - /* Add two locks */ static posix_lock_t * -add_locks (posix_lock_t *l1, posix_lock_t *l2) +add_locks(posix_lock_t *l1, posix_lock_t *l2, posix_lock_t *dst) { - posix_lock_t *sum = NULL; + posix_lock_t *sum = NULL; + + sum = __copy_lock(dst); + if (!sum) + return NULL; - sum = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!sum) - return NULL; + sum->fl_start = min(l1->fl_start, l2->fl_start); + sum->fl_end = max(l1->fl_end, l2->fl_end); - sum->fl_start = min (l1->fl_start, l2->fl_start); - sum->fl_end = max (l1->fl_end, l2->fl_end); + posix_lock_to_flock(sum, &sum->user_flock); - return sum; + return sum; } /* Subtract two locks */ struct _values { - posix_lock_t *locks[3]; + posix_lock_t *locks[3]; }; /* {big} must always be contained inside {small} */ static struct _values -subtract_locks (posix_lock_t *big, posix_lock_t *small) +subtract_locks(posix_lock_t *big, posix_lock_t *small) { + struct _values v = {.locks = {0, 0, 0}}; - struct _values v = { .locks = {0, 0, 0} }; - - if ((big->fl_start == small->fl_start) && - (big->fl_end == small->fl_end)) { - /* both edges coincide with big */ - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_type = small->fl_type; - goto done; + if ((big->fl_start == small->fl_start) && (big->fl_end == small->fl_end)) { + /* both edges coincide with big */ + v.locks[0] = 
__copy_lock(big); + if (!v.locks[0]) { + goto out; } - if ((small->fl_start > big->fl_start) && - (small->fl_end < big->fl_end)) { - /* both edges lie inside big */ - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - v.locks[2] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_end = small->fl_start - 1; - - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - - memcpy (v.locks[2], big, sizeof (posix_lock_t)); - v.locks[2]->fl_start = small->fl_end + 1; - goto done; - + v.locks[0]->fl_type = small->fl_type; + v.locks[0]->user_flock.l_type = small->fl_type; + goto done; + } + + if ((small->fl_start > big->fl_start) && (small->fl_end < big->fl_end)) { + /* both edges lie inside big */ + v.locks[0] = __copy_lock(big); + v.locks[1] = __copy_lock(small); + v.locks[2] = __copy_lock(big); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL) || + (v.locks[2] == NULL)) { + goto out; } - /* one edge coincides with big */ - if (small->fl_start == big->fl_start) { - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; - - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_start = small->fl_end + 1; - - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - goto done; + v.locks[0]->fl_end = small->fl_start - 1; + v.locks[2]->fl_start = small->fl_end + 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + posix_lock_to_flock(v.locks[2], &v.locks[2]->user_flock); + goto done; + } + + /* one edge coincides with big */ + if (small->fl_start == big->fl_start) { + v.locks[0] = __copy_lock(big); + 
v.locks[1] = __copy_lock(small); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) { + goto out; } - if (small->fl_end == big->fl_end) { - v.locks[0] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[0]) - goto out; + v.locks[0]->fl_start = small->fl_end + 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + goto done; + } - v.locks[1] = GF_CALLOC (1, sizeof (posix_lock_t), - gf_locks_mt_posix_lock_t); - if (!v.locks[1]) - goto out; - - memcpy (v.locks[0], big, sizeof (posix_lock_t)); - v.locks[0]->fl_end = small->fl_start - 1; - - memcpy (v.locks[1], small, sizeof (posix_lock_t)); - goto done; + if (small->fl_end == big->fl_end) { + v.locks[0] = __copy_lock(big); + v.locks[1] = __copy_lock(small); + if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) { + goto out; } - GF_ASSERT (0); - gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); + v.locks[0]->fl_end = small->fl_start - 1; + posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock); + goto done; + } + + GF_ASSERT(0); + gf_log("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); out: - if (v.locks[0]) { - GF_FREE (v.locks[0]); - v.locks[0] = NULL; - } - if (v.locks[1]) { - GF_FREE (v.locks[1]); - v.locks[1] = NULL; - } - if (v.locks[2]) { - GF_FREE (v.locks[2]); - v.locks[2] = NULL; - } + if (v.locks[0]) { + __destroy_lock(v.locks[0]); + v.locks[0] = NULL; + } + if (v.locks[1]) { + __destroy_lock(v.locks[1]); + v.locks[1] = NULL; + } + if (v.locks[2]) { + __destroy_lock(v.locks[2]); + v.locks[2] = NULL; + } done: - return v; + return v; } static posix_lock_t * -first_conflicting_overlap (pl_inode_t *pl_inode, posix_lock_t *lock) +first_conflicting_overlap(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; - posix_lock_t *conf = NULL; + posix_lock_t *l = NULL; + posix_lock_t *conf = NULL; - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, 
&pl_inode->ext_list, list) { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->blocked) - continue; - - if (locks_overlap (l, lock)) { - if (same_owner (l, lock)) - continue; - - if ((l->fl_type == F_WRLCK) || - (lock->fl_type == F_WRLCK)) { - conf = l; - goto unlock; - } - } + if (l->blocked) + continue; + + if (locks_overlap(l, lock)) { + if (same_owner(l, lock)) + continue; + + if ((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) { + conf = l; + goto unlock; } + } } + } unlock: - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_unlock(&pl_inode->mutex); - return conf; + return conf; } /* @@ -751,471 +789,803 @@ unlock: If {begin} is NULL, then start from the beginning of the list */ static posix_lock_t * -first_overlap (pl_inode_t *pl_inode, posix_lock_t *lock) +first_overlap(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->blocked) - continue; + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->blocked) + continue; - if (locks_overlap (l, lock)) - return l; - } + if (locks_overlap(l, lock)) + return l; + } - return NULL; + return NULL; } - - /* Return true if lock is grantable */ static int -__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock) +__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; - int ret = 1; - - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (!l->blocked && locks_overlap (lock, l)) { - if (((l->fl_type == F_WRLCK) - || (lock->fl_type == F_WRLCK)) - && (lock->fl_type != F_UNLCK) - && !same_owner (l, lock)) { - ret = 0; - break; - } - } + posix_lock_t *l = NULL; + int ret = 1; + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (!l->blocked && locks_overlap(lock, l)) { + if (((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) && + (lock->fl_type != F_UNLCK) && !same_owner(l, lock)) { + ret = 0; + break; + } } - return ret; 
+ } + return ret; } - -extern void do_blocked_rw (pl_inode_t *); - +extern void +do_blocked_rw(pl_inode_t *); static void -__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) +__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *conf = NULL; - posix_lock_t *t = NULL; - posix_lock_t *sum = NULL; - int i = 0; - struct _values v = { .locks = {0, 0, 0} }; - - list_for_each_entry_safe (conf, t, &pl_inode->ext_list, list) { - if (conf->blocked) - continue; - if (!locks_overlap (conf, lock)) - continue; + posix_lock_t *conf = NULL; + posix_lock_t *t = NULL; + posix_lock_t *sum = NULL; + int i = 0; + struct _values v = {.locks = {0, 0, 0}}; + + list_for_each_entry_safe(conf, t, &pl_inode->ext_list, list) + { + if (conf->blocked) + continue; + if (!locks_overlap(conf, lock)) + continue; + + if (same_owner(conf, lock)) { + if (conf->fl_type == lock->fl_type && + conf->lk_flags == lock->lk_flags) { + sum = add_locks(lock, conf, lock); + + __delete_lock(conf); + __destroy_lock(conf); + + __destroy_lock(lock); + INIT_LIST_HEAD(&sum->list); + posix_lock_to_flock(sum, &sum->user_flock); + __insert_and_merge(pl_inode, sum); - if (same_owner (conf, lock)) { - if (conf->fl_type == lock->fl_type) { - sum = add_locks (lock, conf); + return; + } else { + sum = add_locks(lock, conf, conf); - sum->fl_type = lock->fl_type; - sum->client = lock->client; - sum->fd_num = lock->fd_num; - sum->client_pid = lock->client_pid; - sum->owner = lock->owner; + v = subtract_locks(sum, lock); - __delete_lock (pl_inode, conf); - __destroy_lock (conf); + __delete_lock(conf); + __destroy_lock(conf); - __destroy_lock (lock); - INIT_LIST_HEAD (&sum->list); - posix_lock_to_flock (sum, &sum->user_flock); - __insert_and_merge (pl_inode, sum); + __delete_lock(lock); + __destroy_lock(lock); - return; - } else { - sum = add_locks (lock, conf); + __destroy_lock(sum); - sum->fl_type = conf->fl_type; - sum->client = conf->client; - sum->fd_num = conf->fd_num; - sum->client_pid = 
conf->client_pid; - sum->owner = conf->owner; + for (i = 0; i < 3; i++) { + if (!v.locks[i]) + continue; - v = subtract_locks (sum, lock); + __insert_and_merge(pl_inode, v.locks[i]); + } - __delete_lock (pl_inode, conf); - __destroy_lock (conf); + __delete_unlck_locks(pl_inode); + return; + } + } - __delete_lock (pl_inode, lock); - __destroy_lock (lock); + if (lock->fl_type == F_UNLCK) { + continue; + } - __destroy_lock (sum); + if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { + __insert_lock(pl_inode, lock); + return; + } + } + + /* no conflicts, so just insert */ + if (lock->fl_type != F_UNLCK) { + __insert_lock(pl_inode, lock); + } else { + __destroy_lock(lock); + } +} - for (i = 0; i < 3; i++) { - if (!v.locks[i]) - continue; +void +__grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) +{ + struct list_head tmp_list; + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + posix_lock_t *conf = NULL; + + INIT_LIST_HEAD(&tmp_list); + + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->blocked) { + conf = first_overlap(pl_inode, l); + if (conf) + continue; + + l->blocked = 0; + list_move_tail(&l->list, &tmp_list); + } + } - INIT_LIST_HEAD (&v.locks[i]->list); - posix_lock_to_flock (v.locks[i], - &v.locks[i]->user_flock); - __insert_and_merge (pl_inode, - v.locks[i]); - } + list_for_each_entry_safe(l, tmp, &tmp_list, list) + { + list_del_init(&l->list); - __delete_unlck_locks (pl_inode); - return; - } - } + if (__is_lock_grantable(pl_inode, l)) { + conf = GF_CALLOC(1, sizeof(*conf), gf_locks_mt_posix_lock_t); - if (lock->fl_type == F_UNLCK) { - continue; - } + if (!conf) { + l->blocked = 1; + __insert_lock(pl_inode, l); + continue; + } - if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { - __insert_lock (pl_inode, lock); - return; - } - } + conf->frame = l->frame; + l->frame = NULL; + + posix_lock_to_flock(l, &conf->user_flock); + + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) 
lk-owner:%s %" PRId64 " - %" PRId64 + " => Granted", + l->fl_type == F_UNLCK ? "Unlock" : "Lock", l->client_pid, + lkowner_utoa(&l->owner), l->user_flock.l_start, + l->user_flock.l_len); + + __insert_and_merge(pl_inode, l); - /* no conflicts, so just insert */ - if (lock->fl_type != F_UNLCK) { - __insert_lock (pl_inode, lock); + list_add(&conf->list, granted); } else { - __destroy_lock (lock); + l->blocked = 1; + __insert_lock(pl_inode, l); } + } } - void -__grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode, struct list_head *granted) +grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head tmp_list; - posix_lock_t *l = NULL; - posix_lock_t *tmp = NULL; - posix_lock_t *conf = NULL; + struct list_head granted_list; + posix_lock_t *tmp = NULL; + posix_lock_t *lock = NULL; + pl_local_t *local = NULL; + INIT_LIST_HEAD(&granted_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_locks(this, pl_inode, &granted_list); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted_list, list) + { + list_del_init(&lock->list); + + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + 0, 0, NULL); + local = lock->frame->local; + PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0, + &lock->user_flock, NULL); + __destroy_lock(lock); + } + + return; +} - INIT_LIST_HEAD (&tmp_list); +static int +pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode, + posix_lock_t *old_lock) +{ + struct gf_flock flock = { + 0, + }; + posix_lock_t *unlock_lock = NULL; + int32_t op_errno = 0; - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->blocked) { - conf = first_overlap (pl_inode, l); - if (conf) - continue; + struct list_head granted_list; + posix_lock_t *tmp = NULL; + posix_lock_t *lock = NULL; + pl_local_t *local = NULL; - l->blocked = 0; - list_move_tail (&l->list, &tmp_list); - } - } + int ret = -1; - list_for_each_entry_safe (l, tmp, &tmp_list, list) { 
- list_del_init (&l->list); + INIT_LIST_HEAD(&granted_list); - if (__is_lock_grantable (pl_inode, l)) { - conf = GF_CALLOC (1, sizeof (*conf), - gf_locks_mt_posix_lock_t); + flock.l_type = F_UNLCK; + flock.l_whence = old_lock->user_flock.l_whence; + flock.l_start = old_lock->user_flock.l_start; + flock.l_len = old_lock->user_flock.l_len; + flock.l_pid = old_lock->user_flock.l_pid; - if (!conf) { - l->blocked = 1; - __insert_lock (pl_inode, l); - continue; - } + unlock_lock = new_posix_lock(&flock, old_lock->client, old_lock->client_pid, + &old_lock->owner, old_lock->fd, + old_lock->lk_flags, 0, &op_errno); + GF_VALIDATE_OR_GOTO(this->name, unlock_lock, out); + ret = 0; - conf->frame = l->frame; - l->frame = NULL; + __insert_and_merge(pl_inode, unlock_lock); - posix_lock_to_flock (l, &conf->user_flock); + __grant_blocked_locks(this, pl_inode, &granted_list); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Granted", - l->fl_type == F_UNLCK ? "Unlock" : "Lock", - l->client_pid, lkowner_utoa (&l->owner), - l->user_flock.l_start, - l->user_flock.l_len); + list_for_each_entry_safe(lock, tmp, &granted_list, list) + { + list_del_init(&lock->list); - __insert_and_merge (pl_inode, l); + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + 0, 0, NULL); + local = lock->frame->local; + PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0, + &lock->user_flock, NULL); + __destroy_lock(lock); + } - list_add (&conf->list, granted); - } else { - l->blocked = 1; - __insert_lock (pl_inode, l); - } - } +out: + return ret; } - -void -grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode) +int +pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block) { - struct list_head granted_list; - posix_lock_t *tmp = NULL; - posix_lock_t *lock = NULL; + int ret = 0; - INIT_LIST_HEAD (&granted_list); + errno = 0; - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_locks (this, pl_inode, 
&granted_list); + pthread_mutex_lock(&pl_inode->mutex); + { + /* Send unlock before the actual lock to + prevent lock upgrade / downgrade + problems only if: + - it is a blocking call + - it has other conflicting locks + */ + + if (can_block && !(__is_lock_grantable(pl_inode, lock))) { + ret = pl_send_prelock_unlock(this, pl_inode, lock); + if (ret) + gf_log(this->name, GF_LOG_DEBUG, + "Could not send pre-lock " + "unlock"); } - pthread_mutex_unlock (&pl_inode->mutex); - list_for_each_entry_safe (lock, tmp, &granted_list, list) { - list_del_init (&lock->list); + if (__is_lock_grantable(pl_inode, lock)) { + if (pl_metalock_is_active(pl_inode)) { + __pl_queue_lock(pl_inode, lock); + pthread_mutex_unlock(&pl_inode->mutex); + ret = -2; + goto out; + } + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + __insert_and_merge(pl_inode, lock); + } else if (can_block) { + if (pl_metalock_is_active(pl_inode)) { + __pl_queue_lock(pl_inode, lock); + pthread_mutex_unlock(&pl_inode->mutex); + ret = -2; + goto out; + } + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 + " => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + + pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, + &lock->user_flock, NULL); + + lock->blocked = 1; + __insert_lock(pl_inode, lock); + ret = -1; + } else { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => NOK", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + errno = EAGAIN; + ret = -1; + } + } + pthread_mutex_unlock(&pl_inode->mutex); - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, NULL); + grant_blocked_locks(this, pl_inode); - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, - &lock->user_flock, NULL); + do_blocked_rw(pl_inode); - GF_FREE (lock); - } +out: + return ret; +} - return; +posix_lock_t * +pl_getlk(pl_inode_t *pl_inode, posix_lock_t *lock) +{ + posix_lock_t *conf = first_conflicting_overlap(pl_inode, lock); + if (conf == NULL) { + lock->fl_type = F_UNLCK; + return lock; + } + + return conf; } -static int -pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *old_lock) +gf_boolean_t +pl_does_monkey_want_stuck_lock() { - struct gf_flock flock = {0,}; - posix_lock_t *unlock_lock = NULL; + long int monkey_unlock_rand = 0; + long int monkey_unlock_rand_rem = 0; + + /* coverity[DC.WEAK_CRYPTO] */ + monkey_unlock_rand = random(); + monkey_unlock_rand_rem = monkey_unlock_rand % 100; + if (monkey_unlock_rand_rem == 0) + return _gf_true; + return _gf_false; +} - struct list_head granted_list; - posix_lock_t *tmp = NULL; - posix_lock_t *lock = NULL; +int +pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock) +{ + posix_lock_t *lock = NULL; + posix_lock_t *i = NULL; + pl_rw_req_t *rw = NULL; + pl_rw_req_t *itr = NULL; + struct list_head unwind_blist = { + 0, + }; + struct list_head unwind_rw_list = { + 0, + }; + int ret = 0; + + INIT_LIST_HEAD(&unwind_blist); + INIT_LIST_HEAD(&unwind_rw_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + /* + - go through the lock list + - remove all locks from different owners + - same owner locks will be added or substracted based on + the new request + - add the new lock + */ + list_for_each_entry_safe(lock, i, &pl_inode->ext_list, list) + { + if (lock->blocked) { + list_del_init(&lock->list); + 
list_add(&lock->list, &unwind_blist); + continue; + } + + if (locks_overlap(lock, reqlock)) { + if (same_owner(lock, reqlock)) + continue; + + /* remove conflicting locks */ + list_del_init(&lock->list); + __delete_lock(lock); + __destroy_lock(lock); + } + } + + __insert_and_merge(pl_inode, reqlock); + + list_for_each_entry_safe(rw, itr, &pl_inode->rw_list, list) + { + list_del_init(&rw->list); + list_add(&rw->list, &unwind_rw_list); + } + } + pthread_mutex_unlock(&pl_inode->mutex); + + /* unwind blocked locks */ + list_for_each_entry_safe(lock, i, &unwind_blist, list) + { + PL_STACK_UNWIND_AND_FREE(((pl_local_t *)lock->frame->local), lk, + lock->frame, -1, EBUSY, &lock->user_flock, + NULL); + __destroy_lock(lock); + } + + /* unwind blocked IOs */ + list_for_each_entry_safe(rw, itr, &unwind_rw_list, list) + { + pl_clean_local(rw->stub->frame->local); + call_unwind_error(rw->stub, -1, EBUSY); + } + + return ret; +} - int ret = -1; +/* Return true in case we need to ensure mandatory-locking + * semantics under different modes. 
+ */ +gf_boolean_t +pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode) +{ + posix_locks_private_t *priv = THIS->private; - INIT_LIST_HEAD (&granted_list); + if (priv->mandatory_mode == MLK_FILE_BASED && pl_inode->mandatory) + return _gf_true; + else if (priv->mandatory_mode == MLK_FORCED || + priv->mandatory_mode == MLK_OPTIMAL) + return _gf_true; - flock.l_type = F_UNLCK; - flock.l_whence = old_lock->user_flock.l_whence; - flock.l_start = old_lock->user_flock.l_start; - flock.l_len = old_lock->user_flock.l_len; + return _gf_false; +} +void +pl_clean_local(pl_local_t *local) +{ + if (!local) + return; - unlock_lock = new_posix_lock (&flock, old_lock->client, - old_lock->client_pid, &old_lock->owner, - old_lock->fd); - GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out); - ret = 0; + if (local->inodelk_dom_count_req) + data_unref(local->inodelk_dom_count_req); + loc_wipe(&local->loc[0]); + loc_wipe(&local->loc[1]); + if (local->fd) + fd_unref(local->fd); + if (local->inode) + inode_unref(local->inode); + mem_put(local); +} - __insert_and_merge (pl_inode, unlock_lock); +/* +TODO: detach local initialization from PL_LOCAL_GET_REQUESTS and add it here +*/ +int +pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + pl_local_t *local = NULL; + + if (!loc && !fd) { + return -1; + } + + if (!frame->local) { + local = mem_get0(this->local_pool); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "mem allocation failed"); + return -1; + } - __grant_blocked_locks (this, pl_inode, &granted_list); + local->inode = (loc ? 
inode_ref(loc->inode) : inode_ref(fd->inode)); - list_for_each_entry_safe (lock, tmp, &granted_list, list) { - list_del_init (&lock->list); + frame->local = local; + } - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, NULL); + return 0; +} - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, - &lock->user_flock, NULL); +gf_boolean_t +pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client) +{ + if (client && (client->opversion < GD_OP_VERSION_7_0)) { + return _gf_true; + } + + if (is_lk_owner_null(owner)) { + return _gf_false; + } + return _gf_true; +} - GF_FREE (lock); +static int32_t +pl_inode_from_loc(loc_t *loc, inode_t **pinode) +{ + inode_t *inode = NULL; + int32_t error = 0; + + if (loc->inode != NULL) { + inode = inode_ref(loc->inode); + goto done; + } + + if (loc->parent == NULL) { + error = EINVAL; + goto done; + } + + if (!gf_uuid_is_null(loc->gfid)) { + inode = inode_find(loc->parent->table, loc->gfid); + if (inode != NULL) { + goto done; } + } -out: - return ret; + if (loc->name == NULL) { + error = EINVAL; + goto done; + } + + inode = inode_grep(loc->parent->table, loc->parent, loc->name); + if (inode == NULL) { + /* We haven't found any inode. This means that the file doesn't exist + * or that even if it exists, we don't have any knowledge about it, so + * we don't have locks on it either, which is fine for our purposes. 
*/ + goto done; + } + +done: + *pinode = inode; + + return error; } -int -pl_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +static gf_boolean_t +pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode, + struct timespec *now, struct list_head *contend) { - int ret = 0; + pl_dom_list_t *dom; + pl_inode_lock_t *lock; + gf_boolean_t has_owners = _gf_false; - errno = 0; - - pthread_mutex_lock (&pl_inode->mutex); + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(lock, &dom->inodelk_list, list) { - /* Send unlock before the actual lock to - prevent lock upgrade / downgrade - problems only if: - - it is a blocking call - - it has other conflicting locks - */ - - if (can_block && - !(__is_lock_grantable (pl_inode, lock))) { - ret = pl_send_prelock_unlock (this, pl_inode, - lock); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not send pre-lock " - "unlock"); - } - - if (__is_lock_grantable (pl_inode, lock)) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - __insert_and_merge (pl_inode, lock); - } else if (can_block) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - lock->blocked = 1; - __insert_lock (pl_inode, lock); - ret = -1; - } else { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - errno = EAGAIN; - ret = -1; - } + /* If the lock belongs to the same client, we assume it's related + * to the same operation, so we allow the removal to continue. */ + if (lock->client == client) { + continue; + } + /* If the lock belongs to an internal process, we don't block the + * removal. */ + if (lock->client_pid < 0) { + continue; + } + if (contend == NULL) { + return _gf_true; + } + has_owners = _gf_true; + inodelk_contention_notify_check(xl, lock, now, contend); } - pthread_mutex_unlock (&pl_inode->mutex); + } - grant_blocked_locks (this, pl_inode); + return has_owners; +} - do_blocked_rw (pl_inode); +int32_t +pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode_t **ppl_inode, struct list_head *contend) +{ + struct timespec now; + inode_t *inode; + pl_inode_t *pl_inode; + int32_t error; + + pl_inode = NULL; + + error = pl_inode_from_loc(loc, &inode); + if ((error != 0) || (inode == NULL)) { + goto done; + } + + pl_inode = pl_inode_get(xl, inode, NULL); + if (pl_inode == NULL) { + inode_unref(inode); + error = ENOMEM; + goto done; + } + + /* pl_inode_from_loc() already increments ref count for inode, so + * we only assign here our reference. */ + pl_inode->inode = inode; + + timespec_now(&now); + + pthread_mutex_lock(&pl_inode->mutex); + + if (pl_inode->removed) { + error = ESTALE; + goto unlock; + } + + if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { + error = -1; + /* We skip the unlock here because the caller must create a stub when + * we return -1 and do a call to pl_inode_remove_complete(), which + * assumes the lock is still acquired and will release it once + * everything else is prepared. 
*/ + goto done; + } + + pl_inode->is_locked = _gf_true; + pl_inode->remove_running++; - return ret; -} +unlock: + pthread_mutex_unlock(&pl_inode->mutex); +done: + *ppl_inode = pl_inode; -posix_lock_t * -pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) + return error; +} + +int32_t +pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, + struct list_head *contend) { - posix_lock_t *conf = NULL; + pl_inode_lock_t *lock; + int32_t error = -1; + + if (stub != NULL) { + list_add_tail(&stub->list, &pl_inode->waiting); + pl_inode->is_locked = _gf_true; + } else { + error = ENOMEM; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_inode_lock_t, list); + list_del_init(&lock->list); + __pl_inodelk_unref(lock); + } + } - conf = first_conflicting_overlap (pl_inode, lock); + pthread_mutex_unlock(&pl_inode->mutex); - if (conf == NULL) { - lock->fl_type = F_UNLCK; - return lock; - } + if (error < 0) { + inodelk_contention_notify(xl, contend); + } - return conf; -} + inode_unref(pl_inode->inode); + return error; +} -struct _lock_table * -pl_lock_table_new (void) +void +pl_inode_remove_wake(struct list_head *list) { - struct _lock_table *new = NULL; + call_stub_t *stub; - new = GF_CALLOC (1, sizeof (struct _lock_table), gf_common_mt_lock_table); - if (new == NULL) { - goto out; - } - INIT_LIST_HEAD (&new->entrylk_lockers); - INIT_LIST_HEAD (&new->inodelk_lockers); - LOCK_INIT (&new->lock); -out: - return new; -} + while (!list_empty(list)) { + stub = list_first_entry(list, call_stub_t, list); + list_del_init(&stub->list); + call_resume(stub); + } +} -int -pl_add_locker (struct _lock_table *table, const char *volume, - loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner, - glusterfs_fop_t type) +void +pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) { - int32_t ret = -1; - struct _locker *new = NULL; + struct list_head contend, granted; + struct timespec now; + pl_dom_list_t *dom; - GF_VALIDATE_OR_GOTO 
("lock-table", table, out); - GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + if (pl_inode == NULL) { + return; + } - new = GF_CALLOC (1, sizeof (struct _locker), gf_common_mt_locker); - if (new == NULL) { - goto out; - } - INIT_LIST_HEAD (&new->lockers); + INIT_LIST_HEAD(&contend); + INIT_LIST_HEAD(&granted); + timespec_now(&now); - new->volume = gf_strdup (volume); + pthread_mutex_lock(&pl_inode->mutex); - if (fd == NULL) { - loc_copy (&new->loc, loc); - } else { - new->fd = fd_ref (fd); + if (error == 0) { + if (pl_inode->links >= 0) { + pl_inode->links--; } + if (pl_inode->links == 0) { + pl_inode->removed = _gf_true; + } + } + + pl_inode->remove_running--; - new->pid = pid; - new->owner = *owner; + if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { + pl_inode->is_locked = _gf_false; - LOCK (&table->lock); + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) { - if (type == GF_FOP_ENTRYLK) - list_add_tail (&new->lockers, &table->entrylk_lockers); - else - list_add_tail (&new->lockers, &table->inodelk_lockers); + __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, + &contend); } - UNLOCK (&table->lock); -out: - return ret; -} + } -int -pl_del_locker (struct _lock_table *table, const char *volume, - loc_t *loc, fd_t *fd, gf_lkowner_t *owner, glusterfs_fop_t type) -{ - struct _locker *locker = NULL; - struct _locker *tmp = NULL; - int32_t ret = -1; - struct list_head *head = NULL; - struct list_head del; + pthread_mutex_unlock(&pl_inode->mutex); - GF_VALIDATE_OR_GOTO ("lock-table", table, out); - GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + unwind_granted_inodes(xl, pl_inode, &granted); - INIT_LIST_HEAD (&del); + inodelk_contention_notify(xl, &contend); - LOCK (&table->lock); - { - if (type == GF_FOP_ENTRYLK) { - head = &table->entrylk_lockers; - } else { - head = &table->inodelk_lockers; - } - - list_for_each_entry_safe (locker, tmp, head, lockers) { - if (!is_same_lkowner (&locker->owner, owner) || - strcmp 
(locker->volume, volume)) - continue; - - /* - * It is possible for inodelk lock to come on anon-fd - * and inodelk unlock to come on normal fd in case of - * client re-opens. So don't check for fds to be equal. - */ - if (locker->fd && fd) - list_move_tail (&locker->lockers, &del); - else if (locker->loc.inode && loc && - (locker->loc.inode == loc->inode)) - list_move_tail (&locker->lockers, &del); - } - } - UNLOCK (&table->lock); + inode_unref(pl_inode->inode); +} - tmp = NULL; - locker = NULL; +void +pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, + struct list_head *list) +{ + call_stub_t *stub, *tmp; - list_for_each_entry_safe (locker, tmp, &del, lockers) { - list_del_init (&locker->lockers); - if (locker->fd) - fd_unref (locker->fd); - else - loc_wipe (&locker->loc); + if (!pl_inode->is_locked) { + return; + } - GF_FREE (locker->volume); - GF_FREE (locker); + list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list) + { + if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL, + NULL)) { + list_move_tail(&stub->list, list); } + } +} - ret = 0; -out: - return ret; +/* This function determines if an inodelk attempt can be done now or it needs + * to wait. + * + * Possible return values: + * < 0: An error occurred. Currently only -ESTALE can be returned if the + * inode has been deleted previously by unlink/rmdir/rename + * = 0: The lock can be attempted. + * > 0: The lock needs to wait because a conflicting remove operation is + * ongoing. + */ +int32_t +pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) +{ + pl_dom_list_t *dom; + pl_inode_lock_t *ilock; + + /* If the inode has been deleted, we won't allow any lock. */ + if (pl_inode->removed) { + return -ESTALE; + } + + /* We only synchronize with locks made for regular operations coming from + * the user. Locks done for internal purposes are hard to control and could + * lead to long delays or deadlocks quite easily. 
*/ + if (lock->client_pid < 0) { + return 0; + } + if (!pl_inode->is_locked) { + return 0; + } + if (pl_inode->remove_running > 0) { + return 1; + } + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(ilock, &dom->inodelk_list, list) + { + /* If a lock from the same client is already granted, we allow this + * one to continue. This is necessary to prevent deadlocks when + * multiple locks are taken for the same operation. + * + * On the other side it's unlikely that the same client sends + * completely unrelated locks for the same inode. + */ + if (ilock->client == lock->client) { + return 0; + } + } + } + return 1; } - diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index db19ec978b4..281223bf3b8 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -10,178 +10,253 @@ #ifndef __COMMON_H__ #define __COMMON_H__ -#include "lkowner.h" /*dump locks format strings */ -#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" -#define ENTRY_FMT "type=%s on basename=%s" -#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" -#define GRNTD_AT "granted at %s" -#define BLKD_AT "blocked at %s" -#define CONN_ID "connection-id=%s" -#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT -#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT -#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT - -#define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT -#define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT -#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT - -#define RANGE_BLKD_FMT RANGE_FMT", "DUMP_BLKD_FMT -#define RANGE_GRNTD_FMT RANGE_FMT", "DUMP_GRNTD_FMT -#define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT +#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" +#define ENTRY_FMT "type=%s on basename=%s" +#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" +#define GRNTD_AT "granted at %s" +#define BLKD_AT "blocked at %s" +#define CONN_ID "connection-id=%s" +#define DUMP_BLKD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT +#define DUMP_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " GRNTD_AT +#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT ", " GRNTD_AT + +#define ENTRY_BLKD_FMT ENTRY_FMT ", " DUMP_BLKD_FMT +#define ENTRY_GRNTD_FMT ENTRY_FMT ", " DUMP_GRNTD_FMT +#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT ", " DUMP_BLKD_GRNTD_FMT + +#define RANGE_BLKD_FMT RANGE_FMT ", " DUMP_BLKD_FMT +#define RANGE_GRNTD_FMT RANGE_FMT ", " DUMP_GRNTD_FMT +#define RANGE_BLKD_GRNTD_FMT RANGE_FMT ", " DUMP_BLKD_GRNTD_FMT #define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid) -struct _locker { - struct list_head lockers; - char *volume; - loc_t loc; - fd_t *fd; - gf_lkowner_t owner; - pid_t pid; 
-}; - -struct _lock_table { - struct list_head inodelk_lockers; - struct list_head entrylk_lockers; - gf_lock_t lock; -}; +#define PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params...) \ + do { \ + frame->local = NULL; \ + STACK_UNWIND_STRICT(fop, frame, op_ret, params); \ + if (__local) { \ + if (__local->inodelk_dom_count_req) \ + data_unref(__local->inodelk_dom_count_req); \ + loc_wipe(&__local->loc[0]); \ + loc_wipe(&__local->loc[1]); \ + if (__local->fd) \ + fd_unref(__local->fd); \ + if (__local->inode) \ + inode_unref(__local->inode); \ + if (__local->xdata) { \ + dict_unref(__local->xdata); \ + __local->xdata = NULL; \ + } \ + mem_put(__local); \ + } \ + } while (0) posix_lock_t * -new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - gf_lkowner_t *owner, fd_t *fd); +new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking, + int32_t *op_errno); pl_inode_t * -pl_inode_get (xlator_t *this, inode_t *inode); +pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local); posix_lock_t * -pl_getlk (pl_inode_t *inode, posix_lock_t *lock); +pl_getlk(pl_inode_t *inode, posix_lock_t *lock); int -pl_setlk (xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, - int can_block); +pl_setlk(xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, int can_block); + +int +pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock); void -grant_blocked_locks (xlator_t *this, pl_inode_t *inode); +grant_blocked_locks(xlator_t *this, pl_inode_t *inode); void -posix_lock_to_flock (posix_lock_t *lock, struct gf_flock *flock); +posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock); int -locks_overlap (posix_lock_t *l1, posix_lock_t *l2); +locks_overlap(posix_lock_t *l1, posix_lock_t *l2); int -same_owner (posix_lock_t *l1, posix_lock_t *l2); +same_owner(posix_lock_t *l1, posix_lock_t *l2); -void __delete_lock (pl_inode_t *, posix_lock_t *); +void 
+__delete_lock(posix_lock_t *); -void __destroy_lock (posix_lock_t *); +void +__destroy_lock(posix_lock_t *); pl_dom_list_t * -get_domain (pl_inode_t *pl_inode, const char *volume); +get_domain(pl_inode_t *pl_inode, const char *volume); + +void +grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); + +void +inodelk_contention_notify(xlator_t *this, struct list_head *contend); + +void +__delete_inode_lock(pl_inode_lock_t *lock); + +void +__pl_inodelk_unref(pl_inode_lock_t *lock); void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom); +__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend); void -__delete_inode_lock (pl_inode_lock_t *lock); +unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted); void -__pl_inodelk_unref (pl_inode_lock_t *lock); +grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); void -grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_entry_lock_t *unlocked, pl_dom_list_t *dom); +entrylk_contention_notify(xlator_t *this, struct list_head *contend); -void pl_update_refkeeper (xlator_t *this, inode_t *inode); +void +pl_update_refkeeper(xlator_t *this, inode_t *inode); int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname); +__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname); int32_t -get_inodelk_count (xlator_t *this, inode_t *inode, char *domname); +get_inodelk_count(xlator_t *this, inode_t *inode, char *domname); int32_t -__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode); +__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode); int32_t -get_entrylk_count (xlator_t *this, inode_t *inode); +get_entrylk_count(xlator_t *this, 
inode_t *inode); -void pl_trace_in (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain); +void +pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, const char *domain); -void pl_trace_out (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, int op_ret, int op_errno, const char *domain); +void +pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd, + struct gf_flock *flock, int op_ret, int op_errno, + const char *domain); -void pl_trace_block (xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, - int cmd, struct gf_flock *flock, const char *domain); +void +pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, + int cmd, struct gf_flock *flock, const char *domain); -void pl_trace_flush (xlator_t *this, call_frame_t *frame, fd_t *fd); +void +pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd); -void entrylk_trace_in (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); +void +entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type); -void entrylk_trace_out (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, - int op_ret, int op_errno); +void +entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, int op_ret, int op_errno); -void entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); +void +entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume, + 
fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type); void -pl_print_verdict (char *str, int size, int op_ret, int op_errno); +pl_print_verdict(char *str, int size, int op_ret, int op_errno); void -pl_print_lockee (char *str, int size, fd_t *fd, loc_t *loc); +pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc); void -pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame); +pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame); void -pl_print_inodelk (char *str, int size, int cmd, struct gf_flock *flock, const char *domain); +pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock, + const char *domain); void -pl_trace_release (xlator_t *this, fd_t *fd); +pl_trace_release(xlator_t *this, fd_t *fd); unsigned long -fd_to_fdnum (fd_t *fd); +fd_to_fdnum(fd_t *fd); fd_t * -fd_from_fdnum (posix_lock_t *lock); +fd_from_fdnum(posix_lock_t *lock); int -pl_reserve_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block); +pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block); int -reservelks_equal (posix_lock_t *l1, posix_lock_t *l2); +reservelks_equal(posix_lock_t *l1, posix_lock_t *l2); int -pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock, int can_block); +pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block); int -pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); +pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); + +int32_t +check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename); -uint32_t -check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename); +void +__pl_inodelk_unref(pl_inode_lock_t *lock); +void +__pl_entrylk_unref(pl_entry_lock_t *lock); + +int +pl_metalock_is_active(pl_inode_t *pl_inode); + +void +__pl_queue_lock(pl_inode_t *pl_inode, 
posix_lock_t *reqlock); + +void +inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock, + struct timespec *now, + struct list_head *contend); + +void +entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock, + struct timespec *now, + struct list_head *contend); + +gf_boolean_t +pl_does_monkey_want_stuck_lock(); + +gf_boolean_t +pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode); + +void +pl_clean_local(pl_local_t *local); + +int +pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd); + +gf_boolean_t +pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client); int32_t -pl_add_locker (struct _lock_table *table, const char *volume, - loc_t *loc, - fd_t *fd, - pid_t pid, - gf_lkowner_t *owner, - glusterfs_fop_t type); +pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode_t **ppl_inode, struct list_head *contend); + +int32_t +pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, + struct list_head *contend); + +void +pl_inode_remove_wake(struct list_head *list); + +void +pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error); + +void +pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, + struct list_head *list); int32_t -pl_del_locker (struct _lock_table *table, const char *volume, - loc_t *loc, - fd_t *fd, - gf_lkowner_t *owner, - glusterfs_fop_t type); - -struct _lock_table * -pl_lock_table_new (void); +pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock); #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 0785dc547fc..fd772c850dd 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -7,51 +7,77 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> +#include <glusterfs/upcall-utils.h> #include "locks.h" +#include "clear.h" #include "common.h" +#include "pl-messages.h" -static pl_entry_lock_t * -new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, - client_t *client, pid_t client_pid, gf_lkowner_t *owner, - const char *volume) - +void +__pl_entrylk_unref(pl_entry_lock_t *lock) { - pl_entry_lock_t *newlock = NULL; - - newlock = GF_CALLOC (1, sizeof (pl_entry_lock_t), - gf_locks_mt_pl_entry_lock_t); - if (!newlock) { - goto out; - } - - newlock->basename = basename ? gf_strdup (basename) : NULL; - newlock->type = type; - newlock->trans = client; - newlock->volume = volume; - newlock->client_pid = client_pid; - newlock->owner = *owner; + lock->ref--; + if (!lock->ref) { + GF_FREE((char *)lock->basename); + GF_FREE(lock->connection_id); + GF_FREE(lock); + } +} - INIT_LIST_HEAD (&newlock->domain_list); - INIT_LIST_HEAD (&newlock->blocked_locks); +static void +__pl_entrylk_ref(pl_entry_lock_t *lock) +{ + lock->ref++; +} +static pl_entry_lock_t * +new_entrylk_lock(pl_inode_t *pinode, const char *basename, entrylk_type type, + const char *domain, call_frame_t *frame, char *conn_id, + int32_t *op_errno) +{ + pl_entry_lock_t *newlock = NULL; + + if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) { + *op_errno = EINVAL; + goto out; + } + + newlock = GF_CALLOC(1, sizeof(pl_entry_lock_t), + gf_locks_mt_pl_entry_lock_t); + if (!newlock) { + *op_errno = ENOMEM; + goto out; + } + + newlock->basename = basename ? 
gf_strdup(basename) : NULL; + newlock->type = type; + newlock->client = frame->root->client; + newlock->client_pid = frame->root->pid; + newlock->volume = domain; + newlock->owner = frame->root->lk_owner; + newlock->frame = frame; + newlock->this = frame->this; + + if (conn_id) { + newlock->connection_id = gf_strdup(conn_id); + } + + INIT_LIST_HEAD(&newlock->domain_list); + INIT_LIST_HEAD(&newlock->blocked_locks); + INIT_LIST_HEAD(&newlock->client_list); + + __pl_entrylk_ref(newlock); out: - return newlock; + return newlock; } - /** * all_names - does a basename represent all names? * @basename: name to check @@ -66,206 +92,411 @@ out: */ static int -names_conflict (const char *n1, const char *n2) +names_conflict(const char *n1, const char *n2) { - return all_names (n1) || all_names (n2) || !strcmp (n1, n2); + return all_names(n1) || all_names(n2) || !strcmp(n1, n2); } +static int +__same_entrylk_owner(pl_entry_lock_t *l1, pl_entry_lock_t *l2) +{ + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); +} -static inline int -__same_entrylk_owner (pl_entry_lock_t *l1, pl_entry_lock_t *l2) +/* Just as in inodelk, allow conflicting name locks from same (lk_owner, conn)*/ +static int +__conflicting_entrylks(pl_entry_lock_t *l1, pl_entry_lock_t *l2) { + if (names_conflict(l1->basename, l2->basename) && + !__same_entrylk_owner(l1, l2)) + return 1; + + return 0; +} - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->trans == l2->trans)); +/* See comments in inodelk.c for details */ +static inline gf_boolean_t +__stale_entrylk(xlator_t *this, pl_entry_lock_t *candidate_lock, + pl_entry_lock_t *requested_lock, time_t *lock_age_sec) +{ + posix_locks_private_t *priv = NULL; + + priv = this->private; + + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? 
+ */ + if (names_conflict(candidate_lock->basename, requested_lock->basename)) { + *lock_age_sec = gf_time() - candidate_lock->granted_time; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; } +/* See comments in inodelk.c for details */ +static gf_boolean_t +__entrylk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + args.type = CLRLK_ENTRY; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty(&dom->entrylk_list)) + goto out; + + pthread_mutex_lock(&pinode->mutex); + lock->pinode = pinode; + list_for_each_entry_safe(lk, tmp, &dom->entrylk_list, domain_list) + { + if (__stale_entrylk(this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe(lk, tmp, &dom->blocked_entrylks, blocked_locks) + { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock(&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_entrylk(this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log(this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Entry lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + + return revoke_lock; +} + +void +entrylk_contention_notify_check(xlator_t 
*this, pl_entry_lock_t *lock, + struct timespec *now, struct list_head *contend) +{ + posix_locks_private_t *priv; + int64_t elapsed; + + priv = this->private; + + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { + return; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return; + } + + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To avoid the lock or the inode to be + * destroyed before we process them, we take an additional reference + * on both. */ + inode_ref(lock->pinode->inode); + __pl_entrylk_ref(lock); + + lock->contention_time = *now; + + list_add_tail(&lock->contend, contend); +} + +void +entrylk_contention_notify(xlator_t *this, struct list_head *contend) +{ + struct gf_upcall up; + struct gf_upcall_entrylk_contention lc; + pl_entry_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_entry_lock_t, contend); + + pl_inode = lock->pinode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->domain_list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + lc.type = lock->type; + lc.name = lock->basename; + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pinode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the entrylk + * was issued by a server side xlator. 
*/ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } + + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_ENTRYLK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "sent"); + } + } + + pthread_mutex_lock(&pl_inode->mutex); + + list_del_init(&lock->contend); + __pl_entrylk_unref(lock); + + pthread_mutex_unlock(&pl_inode->mutex); + + inode_unref(pl_inode->inode); + } +} /** - * lock_grantable - is this lock grantable? + * entrylk_grantable - is this lock grantable? * @inode: inode in which to look * @basename: name we're trying to lock * @type: type of lock */ static pl_entry_lock_t * -__lock_grantable (pl_dom_list_t *dom, const char *basename, entrylk_type type) +__entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock, + struct timespec *now, struct list_head *contend) { - pl_entry_lock_t *lock = NULL; - - if (list_empty (&dom->entrylk_list)) - return NULL; - - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - if (names_conflict (lock->basename, basename)) - return lock; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *ret = NULL; + + list_for_each_entry(tmp, &dom->entrylk_list, domain_list) + { + if (__conflicting_entrylks(tmp, lock)) { + if (ret == NULL) { + ret = tmp; + if (contend == NULL) { + break; + } + } + entrylk_contention_notify_check(this, tmp, now, contend); } + } - return NULL; + return ret; } static pl_entry_lock_t * -__blocked_lock_conflict (pl_dom_list_t *dom, const char *basename, entrylk_type type) +__blocked_entrylk_conflict(pl_dom_list_t *dom, pl_entry_lock_t *lock) { - pl_entry_lock_t *lock = NULL; - - if (list_empty (&dom->blocked_entrylks)) - return NULL; + pl_entry_lock_t *tmp = NULL; - list_for_each_entry (lock, &dom->blocked_entrylks, 
blocked_locks) { - if (names_conflict (lock->basename, basename)) - return lock; - } + list_for_each_entry(tmp, &dom->blocked_entrylks, blocked_locks) + { + if (names_conflict(tmp->basename, lock->basename)) + return lock; + } - return NULL; + return NULL; } static int -__owner_has_lock (pl_dom_list_t *dom, pl_entry_lock_t *newlock) +__owner_has_lock(pl_dom_list_t *dom, pl_entry_lock_t *newlock) { - pl_entry_lock_t *lock = NULL; + pl_entry_lock_t *lock = NULL; - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - if (__same_entrylk_owner (lock, newlock)) - return 1; - } + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + if (__same_entrylk_owner(lock, newlock)) + return 1; + } - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { - if (__same_entrylk_owner (lock, newlock)) - return 1; - } + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) + { + if (__same_entrylk_owner(lock, newlock)) + return 1; + } - return 0; + return 0; } static int -names_equal (const char *n1, const char *n2) +names_equal(const char *n1, const char *n2) { - return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp (n1, n2)); + return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp(n1, n2)); } void -pl_print_entrylk (char *str, int size, entrylk_cmd cmd, entrylk_type type, - const char *basename, const char *domain) +pl_print_entrylk(char *str, int size, entrylk_cmd cmd, entrylk_type type, + const char *basename, const char *domain) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { case ENTRYLK_LOCK: - cmd_str = "LOCK"; - break; + cmd_str = "LOCK"; + break; case ENTRYLK_LOCK_NB: - cmd_str = "LOCK_NB"; - break; + cmd_str = "LOCK_NB"; + break; case ENTRYLK_UNLOCK: - cmd_str = "UNLOCK"; - break; + cmd_str = "UNLOCK"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (type) { + switch (type) { case 
ENTRYLK_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case ENTRYLK_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; default: - type_str = "UNKNOWN"; - break; - } + type_str = "UNKNOWN"; + break; + } - snprintf (str, size, "lock=ENTRYLK, cmd=%s, type=%s, basename=%s, domain: %s", - cmd_str, type_str, basename, domain); + snprintf(str, size, + "lock=ENTRYLK, cmd=%s, type=%s, basename=%s, domain: %s", cmd_str, + type_str, basename, domain); } - void -entrylk_trace_in (xlator_t *this, call_frame_t *frame, const char *domain, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) +entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *domain, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, domain); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain); - gf_log (this->name, GF_LOG_INFO, - "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_entrylk); } - void -entrylk_trace_out (xlator_t *this, call_frame_t *frame, const char *domain, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, int op_ret, int op_errno) +entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *domain, + fd_t *fd, loc_t 
*loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, int op_ret, int op_errno) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; - char verdict[32]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; + char verdict[32]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, domain); - pl_print_verdict (verdict, 32, op_ret, op_errno); + pl_print_locker(pl_locker, 256, this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain); + pl_print_verdict(verdict, 32, op_ret, op_errno); - gf_log (this->name, GF_LOG_INFO, - "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", - verdict, pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker, + pl_lockee, pl_entrylk); } - void -entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, - fd_t *fd, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) +entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume, + fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) { - posix_locks_private_t *priv = NULL; - char pl_locker[256]; - char pl_lockee[256]; - char pl_entrylk[256]; + posix_locks_private_t *priv = NULL; + char pl_locker[256]; + char pl_lockee[256]; + char pl_entrylk[256]; - priv = this->private; + priv = this->private; - if (!priv->trace) - return; + if (!priv->trace) + return; - pl_print_locker (pl_locker, 256, this, frame); - pl_print_lockee (pl_lockee, 256, fd, loc); - pl_print_entrylk (pl_entrylk, 256, cmd, type, basename, volume); + pl_print_locker(pl_locker, 256, 
this, frame); + pl_print_lockee(pl_lockee, 256, fd, loc); + pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, volume); - gf_log (this->name, GF_LOG_INFO, - "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", - pl_locker, pl_lockee, pl_entrylk); + gf_log(this->name, GF_LOG_INFO, + "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker, + pl_lockee, pl_entrylk); } /** - * __find_most_matching_lock - find the lock struct which most matches in order of: - * lock on the exact basename || - * an all_names lock + * __find_most_matching_lock - find the lock struct which most matches in order + * of: lock on the exact basename || an all_names lock * * * @inode: inode in which to look @@ -273,27 +504,61 @@ entrylk_trace_block (xlator_t *this, call_frame_t *frame, const char *volume, */ static pl_entry_lock_t * -__find_most_matching_lock (pl_dom_list_t *dom, const char *basename) +__find_most_matching_lock(pl_dom_list_t *dom, const char *basename) { - pl_entry_lock_t *lock; - pl_entry_lock_t *all = NULL; - pl_entry_lock_t *exact = NULL; + pl_entry_lock_t *lock; + pl_entry_lock_t *all = NULL; + pl_entry_lock_t *exact = NULL; - if (list_empty (&dom->entrylk_list)) - return NULL; + if (list_empty(&dom->entrylk_list)) + return NULL; - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - if (all_names (lock->basename)) - all = lock; - else if (names_equal (lock->basename, basename)) - exact = lock; - } + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + if (all_names(lock->basename)) + all = lock; + else if (names_equal(lock->basename, basename)) + exact = lock; + } - return (exact ? exact : all); + return (exact ? 
exact : all); +} + +static pl_entry_lock_t * +__find_matching_lock(pl_dom_list_t *dom, pl_entry_lock_t *lock) +{ + pl_entry_lock_t *tmp = NULL; + + list_for_each_entry(tmp, &dom->entrylk_list, domain_list) + { + if (names_equal(lock->basename, tmp->basename) && + __same_entrylk_owner(lock, tmp) && (lock->type == tmp->type)) + return tmp; + } + return NULL; +} + +static int +__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock, int nonblock) +{ + if (nonblock) + goto out; + + lock->blkd_time = gf_time(); + list_add_tail(&lock->blocked_locks, &dom->blocked_entrylks); + + gf_msg_trace(this->name, 0, "Blocking lock: {pinode=%p, basename=%s}", + pinode, lock->basename); + + entrylk_trace_block(this, lock->frame, NULL, NULL, NULL, lock->basename, + ENTRYLK_LOCK, lock->type); +out: + return -EAGAIN; } /** - * __lock_name - lock a name in a directory + * __lock_entrylk - lock a name in a directory * @inode: inode for the directory in which to lock * @basename: name of the entry to lock * if null, lock the entire directory @@ -304,465 +569,375 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename) */ int -__lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, - call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, - int nonblock, char *conn_id) -{ - pl_entry_lock_t *lock = NULL; - pl_entry_lock_t *conf = NULL; - int ret = -EINVAL; - - lock = new_entrylk_lock (pinode, basename, type, - frame->root->client, frame->root->pid, - &frame->root->lk_owner, dom->domain); - if (!lock) { - ret = -ENOMEM; - goto out; - } - - lock->frame = frame; - lock->this = this; - lock->trans = frame->root->client; - - if (conn_id) { - lock->connection_id = gf_strdup (conn_id); - } - - conf = __lock_grantable (dom, basename, type); - if (conf) { - ret = -EAGAIN; - if (nonblock){ - GF_FREE (lock->connection_id); - GF_FREE ((char *)lock->basename); - GF_FREE (lock); - goto out; - - } - - gettimeofday 
(&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, basename); - - goto out; - } - - if ( __blocked_lock_conflict (dom, basename, type) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (nonblock) { - GF_FREE (lock->connection_id); - GF_FREE ((char *) lock->basename); - GF_FREE (lock); - goto out; - - } - lock->frame = frame; - lock->this = this; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - - gf_log (this->name, GF_LOG_TRACE, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, basename); - - ret = -EAGAIN; - goto out; +__lock_entrylk(xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, + int nonblock, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) +{ + pl_entry_lock_t *conf = NULL; + int ret = -EAGAIN; + + conf = __entrylk_grantable(this, dom, lock, now, contend); + if (conf) { + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); + goto out; + } + + /* To prevent blocked locks starvation, check if there are any blocked + * locks thay may conflict with this lock. If there is then don't grant + * the lock. BUT grant the lock if the owner already has lock to allow + * nested locks. + * Example: SHD from Machine1 takes (gfid, basename=257-length-name) + * and is granted. + * SHD from machine2 takes (gfid, basename=NULL) and is blocked. + * When SHD from Machine1 takes (gfid, basename=NULL) it needs to be + * granted, without which self-heal can't progress. + * TODO: Find why 'owner_has_lock' is checked even for blocked locks. 
+ */ + if (__blocked_entrylk_conflict(dom, lock) && + !(__owner_has_lock(dom, lock))) { + if (nonblock == 0) { + gf_log(this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); } - switch (type) { - case ENTRYLK_WRLCK: - gettimeofday (&lock->granted_time, NULL); - list_add_tail (&lock->domain_list, &dom->entrylk_list); - break; + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); + goto out; + } - default: + __pl_entrylk_ref(lock); + lock->granted_time = gf_time(); + list_add(&lock->domain_list, &dom->entrylk_list); - gf_log (this->name, GF_LOG_DEBUG, - "Invalid type for entrylk specified: %d", type); - ret = -EINVAL; - goto out; - } - - ret = 0; + ret = 0; out: - return ret; + return ret; } /** - * __unlock_name - unlock a name in a directory + * __unlock_entrylk - unlock a name in a directory * @inode: inode for the directory to unlock in * @basename: name of the entry to unlock * if null, unlock the entire directory */ pl_entry_lock_t * -__unlock_name (pl_dom_list_t *dom, const char *basename, entrylk_type type) +__unlock_entrylk(pl_dom_list_t *dom, pl_entry_lock_t *lock) { - pl_entry_lock_t *lock = NULL; - pl_entry_lock_t *ret_lock = NULL; - - lock = __find_most_matching_lock (dom, basename); + pl_entry_lock_t *ret_lock = NULL; - if (!lock) { - gf_log ("locks", GF_LOG_DEBUG, - "unlock on %s (type=ENTRYLK_WRLCK) attempted but no matching lock found", - basename); - goto out; - } + ret_lock = __find_matching_lock(dom, lock); - if (names_equal (lock->basename, basename) - && lock->type == type) { + if (ret_lock) { + list_del_init(&ret_lock->domain_list); + } else { + gf_log("locks", GF_LOG_ERROR, + "unlock on %s " + "(type=ENTRYLK_WRLCK) attempted but no matching lock " + "found", + lock->basename); + } - if (type == ENTRYLK_WRLCK) { - list_del_init (&lock->domain_list); - ret_lock = lock; - } - } else { - gf_log ("locks", GF_LOG_DEBUG, - "Unlock for a non-existing lock!"); - goto out; - } - -out: - return ret_lock; + 
return ret_lock; } -uint32_t -check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename) +int32_t +check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename) { - uint32_t entrylk = 0; - pl_inode_t *pinode = 0; - pl_dom_list_t *dom = NULL; - pl_entry_lock_t *conf = NULL; - - pinode = pl_inode_get (this, parent); - if (!pinode) - goto out; - pthread_mutex_lock (&pinode->mutex); + int32_t entrylk = 0; + pl_dom_list_t *dom = NULL; + pl_entry_lock_t *conf = NULL; + + pl_inode_t *pinode = pl_inode_get(this, parent, NULL); + if (!pinode) + goto out; + pthread_mutex_lock(&pinode->mutex); + { + list_for_each_entry(dom, &pinode->dom_list, inode_list) { - list_for_each_entry (dom, &pinode->dom_list, inode_list) { - conf = __lock_grantable (dom, basename, ENTRYLK_WRLCK); - if (conf && conf->basename) { - entrylk = 1; - break; - } - } + conf = __find_most_matching_lock(dom, basename); + if (conf && conf->basename) { + entrylk = 1; + break; + } } - pthread_mutex_unlock (&pinode->mutex); + } + pthread_mutex_unlock(&pinode->mutex); out: - return entrylk; + return entrylk; } void -__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom, struct list_head *granted) +__grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct list_head *granted, + struct timespec *now, struct list_head *contend) { - int bl_ret = 0; - pl_entry_lock_t *bl = NULL; - pl_entry_lock_t *tmp = NULL; + int bl_ret = 0; + pl_entry_lock_t *bl = NULL; + pl_entry_lock_t *tmp = NULL; - struct list_head blocked_list; + struct list_head blocked_list; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&dom->blocked_entrylks, &blocked_list); + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&dom->blocked_entrylks, &blocked_list); - list_for_each_entry_safe (bl, tmp, &blocked_list, - blocked_locks) { + list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks) + { + list_del_init(&bl->blocked_locks); - 
list_del_init (&bl->blocked_locks); + bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend); - - gf_log ("locks", GF_LOG_TRACE, - "Trying to unblock: {pinode=%p, basename=%s}", - pl_inode, bl->basename); - - bl_ret = __lock_name (pl_inode, bl->basename, bl->type, - bl->frame, dom, bl->this, 0, - bl->connection_id); - - if (bl_ret == 0) { - list_add (&bl->blocked_locks, granted); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "should never happen"); - GF_FREE (bl->connection_id); - GF_FREE ((char *)bl->basename); - GF_FREE (bl); - } + if (bl_ret == 0) { + list_add_tail(&bl->blocked_locks, granted); } - return; + } } /* Grants locks if possible which are blocked on a lock */ void -grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_entry_lock_t *unlocked, pl_dom_list_t *dom) +grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { - struct list_head granted_list; - pl_entry_lock_t *tmp = NULL; - pl_entry_lock_t *lock = NULL; - - INIT_LIST_HEAD (&granted_list); - - pthread_mutex_lock (&pl_inode->mutex); + struct list_head granted_list; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lock = NULL; + + INIT_LIST_HEAD(&granted_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_entry_locks(this, pl_inode, dom, &granted_list, now, + contend); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks) + { + entrylk_trace_out(this, lock->frame, NULL, NULL, NULL, lock->basename, + ENTRYLK_LOCK, lock->type, 0, 0); + + STACK_UNWIND_STRICT(entrylk, lock->frame, 0, 0, NULL); + lock->frame = NULL; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks) { - __grant_blocked_entry_locks (this, pl_inode, dom, - &granted_list); + list_del_init(&lock->blocked_locks); + __pl_entrylk_unref(lock); } - pthread_mutex_unlock 
(&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) { - list_del_init (&lock->blocked_locks); - - entrylk_trace_out (this, lock->frame, NULL, NULL, NULL, - lock->basename, ENTRYLK_LOCK, lock->type, - 0, 0); - - STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); - - GF_FREE (lock->connection_id); - GF_FREE ((char *)lock->basename); - GF_FREE (lock); - } - - GF_FREE ((char *)unlocked->basename); - GF_FREE (unlocked->connection_id); - GF_FREE (unlocked); - - return; -} - -/** - * release_entry_locks_for_client: release all entry locks from this - * client for this loc_t - */ - -static int -release_entry_locks_for_client (xlator_t *this, pl_inode_t *pinode, - pl_dom_list_t *dom, client_t *client) -{ - pl_entry_lock_t *lock = NULL; - pl_entry_lock_t *tmp = NULL; - struct list_head granted; - struct list_head released; - - INIT_LIST_HEAD (&granted); - INIT_LIST_HEAD (&released); - - pthread_mutex_lock (&pinode->mutex); - { - list_for_each_entry_safe (lock, tmp, &dom->blocked_entrylks, - blocked_locks) { - if (lock->trans != client) - continue; - - list_del_init (&lock->blocked_locks); - - gf_log (this->name, GF_LOG_TRACE, - "releasing lock on held by " - "{client=%p}", client); - - list_add (&lock->blocked_locks, &released); - - } - - list_for_each_entry_safe (lock, tmp, &dom->entrylk_list, - domain_list) { - if (lock->trans != client) - continue; - - list_del_init (&lock->domain_list); - - gf_log (this->name, GF_LOG_TRACE, - "releasing lock on held by " - "{client=%p}", client); - - GF_FREE ((char *)lock->basename); - GF_FREE (lock->connection_id); - GF_FREE (lock); - } - - __grant_blocked_entry_locks (this, pinode, dom, &granted); - - } - - pthread_mutex_unlock (&pinode->mutex); - - list_for_each_entry_safe (lock, tmp, &released, blocked_locks) { - list_del_init (&lock->blocked_locks); - - STACK_UNWIND_STRICT (entrylk, lock->frame, -1, EAGAIN, NULL); - - GF_FREE ((char *)lock->basename); - GF_FREE (lock->connection_id); - 
GF_FREE (lock); - - } - - list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { - list_del_init (&lock->blocked_locks); - - STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); - - GF_FREE ((char *)lock->basename); - GF_FREE (lock->connection_id); - GF_FREE (lock); - } - - return 0; + } + pthread_mutex_unlock(&pl_inode->mutex); } /* Common entrylk code called by pl_entrylk and pl_fentrylk */ int -pl_common_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, inode_t *inode, const char *basename, - entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd, - dict_t *xdata) - -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - char unwind = 1; - GF_UNUSED int dict_ret = -1; - pl_inode_t *pinode = NULL; - pl_entry_lock_t *unlocked = NULL; - pl_dom_list_t *dom = NULL; - char *conn_id = NULL; - pl_ctx_t *ctx = NULL; - - if (xdata) - dict_ret = dict_get_str (xdata, "connection-id", &conn_id); - - pinode = pl_inode_get (this, inode); - if (!pinode) { - op_errno = ENOMEM; - goto out; - } +pl_common_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + inode_t *inode, const char *basename, entrylk_cmd cmd, + entrylk_type type, loc_t *loc, fd_t *fd, dict_t *xdata) - dom = get_domain (pinode, volume); - if (!dom){ - op_errno = ENOMEM; - goto out; +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + char unwind = 1; + GF_UNUSED int dict_ret = -1; + pl_inode_t *pinode = NULL; + pl_entry_lock_t *reqlock = NULL; + pl_entry_lock_t *unlocked = NULL; + pl_dom_list_t *dom = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + int nonblock = 0; + gf_boolean_t need_inode_unref = _gf_false; + posix_locks_private_t *priv = NULL; + struct list_head *pcontend = NULL; + struct list_head contend; + struct timespec now = {}; + + priv = this->private; + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + if (xdata) + dict_ret = dict_get_str(xdata, 
"connection-id", &conn_id); + + pinode = pl_inode_get(this, inode, NULL); + if (!pinode) { + op_errno = ENOMEM; + goto out; + } + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + op_errno = ENOMEM; + gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; } - - entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type); - - if (frame->root->lk_owner.len == 0) { - /* - this is a special case that means release - all locks from this client - */ - - gf_log (this->name, GF_LOG_TRACE, - "Releasing locks for client %p", frame->root->client); - - release_entry_locks_for_client (this, pinode, dom, - frame->root->client); + } + + dom = get_domain(pinode, volume); + if (!dom) { + op_errno = ENOMEM; + goto out; + } + + entrylk_trace_in(this, frame, volume, fd, loc, basename, cmd, type); + + reqlock = new_entrylk_lock(pinode, basename, type, dom->domain, frame, + conn_id, &op_errno); + if (!reqlock) { + op_ret = -1; + goto unwind; + } + + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or + * an unsuccessful blocking lock operation, the inode needs to be ref'd. + * + * But doing so might give room to a race where the lock-requesting + * client could send a DISCONNECT just before this thread refs the inode + * after the locking is done, and the epoll thread could unref the inode + * in cleanup which means the inode's refcount would come down to 0, and + * the call to pl_forget() at this point destroys @pinode. Now when + * the io-thread executing this function tries to access pinode, + * it could crash on account of illegal memory access. + * + * To get around this problem, the inode is ref'd once even before + * adding the lock into client_list as a precautionary measure. + * This way even if there are DISCONNECTs, there will always be 1 extra + * ref on the inode, so @pinode is still alive until after the + * current stack unwinds. 
+ */ + pinode->inode = inode_ref(inode); + if (priv->revocation_secs != 0) { + if (cmd != ENTRYLK_UNLOCK) { + __entrylk_prune_stale(this, pinode, dom, reqlock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock()) { + gf_log(this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); op_ret = 0; - + need_inode_unref = _gf_true; + pthread_mutex_lock(&pinode->mutex); + { + __pl_entrylk_unref(reqlock); + } + pthread_mutex_unlock(&pinode->mutex); goto out; + } } + } - switch (cmd) { + switch (cmd) { + case ENTRYLK_LOCK_NB: + nonblock = 1; + /* fall through */ case ENTRYLK_LOCK: - pthread_mutex_lock (&pinode->mutex); - { - ret = __lock_name (pinode, basename, type, - frame, dom, this, 0, conn_id); - } - pthread_mutex_unlock (&pinode->mutex); - - op_errno = -ret; - if (ret < 0) { - if (ret == -EAGAIN) - unwind = 0; - else - unwind = 1; - goto out; + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pinode->mutex); + { + reqlock->pinode = pinode; + + ret = __lock_entrylk(this, pinode, reqlock, nonblock, dom, &now, + pcontend); + if (ret == 0) { + reqlock->frame = NULL; + op_ret = 0; } else { - op_ret = 0; - op_errno = 0; - unwind = 1; - goto out; + op_errno = -ret; } - break; - - case ENTRYLK_LOCK_NB: - unwind = 1; - pthread_mutex_lock (&pinode->mutex); - { - ret = __lock_name (pinode, basename, type, - frame, dom, this, 1, conn_id); - } - pthread_mutex_unlock (&pinode->mutex); + if (ctx && (!ret || !nonblock)) + list_add(&reqlock->client_list, &ctx->entrylk_lockers); - if (ret < 0) { - op_errno = -ret; - goto out; + if (ret == -EAGAIN && !nonblock) { + /* blocked */ + unwind = 0; + } else { + __pl_entrylk_unref(reqlock); } - break; + /* For all but the case where a non-blocking lock + * attempt fails, the extra ref taken before the switch + * block must be negated. 
+ */ + if ((ret == -EAGAIN) && (nonblock)) + need_inode_unref = _gf_true; + } + pthread_mutex_unlock(&pinode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); + break; case ENTRYLK_UNLOCK: - pthread_mutex_lock (&pinode->mutex); - { - unlocked = __unlock_name (dom, basename, type); + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pinode->mutex); + { + /* Irrespective of whether unlock succeeds or not, + * the extra inode ref that was done before the switch + * block must be negated. Towards this, + * @need_inode_unref flag is set unconditionally here. + */ + need_inode_unref = _gf_true; + unlocked = __unlock_entrylk(dom, reqlock); + if (unlocked) { + list_del_init(&unlocked->client_list); + __pl_entrylk_unref(unlocked); + op_ret = 0; + } else { + op_errno = EINVAL; } - pthread_mutex_unlock (&pinode->mutex); + __pl_entrylk_unref(reqlock); + } + pthread_mutex_unlock(&pinode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); - if (unlocked) - grant_blocked_entry_locks (this, pinode, unlocked, dom); + grant_blocked_entry_locks(this, pinode, dom, &now, pcontend); - break; + break; default: - gf_log (this->name, GF_LOG_ERROR, - "Unexpected case in entrylk (cmd=%d). Please file" - "a bug report at http://bugs.gluster.com", cmd); - goto out; - } + need_inode_unref = _gf_true; + gf_log(this->name, GF_LOG_ERROR, + "Unexpected case in entrylk (cmd=%d). Please file" + "a bug report at http://bugs.gluster.com", + cmd); + goto out; + } + /* The following (extra) unref corresponds to the ref that + * was done at the time the lock was granted. 
+ */ + if ((cmd == ENTRYLK_UNLOCK) && (op_ret == 0)) + inode_unref(pinode->inode); - op_ret = 0; out: - pl_update_refkeeper (this, inode); - if (unwind) { - entrylk_trace_out (this, frame, volume, fd, loc, basename, - cmd, type, op_ret, op_errno); - ctx = pl_ctx_get (frame->root->client, this); + if (need_inode_unref) + inode_unref(pinode->inode); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); - goto unwind; - } - - if (cmd == ENTRYLK_UNLOCK) - pl_del_locker (ctx->ltable, volume, loc, fd, - &frame->root->lk_owner, - GF_FOP_ENTRYLK); - else - pl_add_locker (ctx->ltable, volume, loc, fd, - frame->root->pid, - &frame->root->lk_owner, - GF_FOP_ENTRYLK); - -unwind: - STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL); - } else { - entrylk_trace_block (this, frame, volume, fd, loc, basename, - cmd, type); - } + if (unwind) { + entrylk_trace_out(this, frame, volume, fd, loc, basename, cmd, type, + op_ret, op_errno); + unwind: + STACK_UNWIND_STRICT(entrylk, frame, op_ret, op_errno, NULL); + } + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } - return 0; + return 0; } /** @@ -772,17 +947,16 @@ unwind: */ int -pl_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, - type, loc, NULL, xdata); + pl_common_entrylk(frame, this, volume, loc->inode, basename, cmd, type, loc, + NULL, xdata); - return 0; + return 0; } - /** * pl_fentrylk: * @@ -790,59 +964,190 @@ pl_entrylk (call_frame_t *frame, xlator_t *this, */ int -pl_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) 
+pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, - type, NULL, fd, xdata); + pl_common_entrylk(frame, this, volume, fd->inode, basename, cmd, type, NULL, + fd, xdata); - return 0; + return 0; } - -int32_t -__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode) +static void +pl_entrylk_log_cleanup(pl_entry_lock_t *lock) { - int32_t count = 0; - pl_entry_lock_t *lock = NULL; - pl_dom_list_t *dom = NULL; + pl_inode_t *pinode = NULL; - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - count++; - } + pinode = lock->pinode; + + gf_log(THIS->name, GF_LOG_WARNING, + "releasing lock on %s held by " + "{client=%p, pid=%" PRId64 " lk-owner=%s}", + uuid_utoa(pinode->gfid), lock->client, (uint64_t)lock->client_pid, + lkowner_utoa(&lock->owner)); +} - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { - count++; +/* Release all entrylks from this client */ +int +pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) +{ + posix_locks_private_t *priv; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *l = NULL; + pl_dom_list_t *dom = NULL; + pl_inode_t *pinode = NULL; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head unwind; + struct list_head contend; + struct timespec now = {}; + + INIT_LIST_HEAD(&released); + INIT_LIST_HEAD(&unwind); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + pthread_mutex_lock(&ctx->lock); + { + list_for_each_entry_safe(l, tmp, &ctx->entrylk_lockers, client_list) + { + pl_entrylk_log_cleanup(l); + + pinode = l->pinode; + + pthread_mutex_lock(&pinode->mutex); + { + /* If the entrylk object is part of granted list but not + * blocked list, then perform the 
following actions: + * i. delete the object from granted list; + * ii. grant other locks (from other clients) that may + * have been blocked on this entrylk; and + * iii. unref the object. + * + * If the entrylk object (L1) is part of both granted + * and blocked lists, then this means that a parallel + * unlock on another entrylk (L2 say) may have 'granted' + * L1 and added it to 'granted' list in + * __grant_blocked_entry_locks() (although using the + * 'blocked_locks' member). In that case, the cleanup + * codepath must try and grant other overlapping + * blocked entrylks from other clients, now that L1 is + * out of their way and then unref L1 in the end, and + * leave it to the other thread (the one executing + * unlock codepath) to unwind L1's frame, delete it from + * blocked_locks list, and perform the last unref on L1. + * + * If the entrylk object (L1) is part of blocked list + * only, the cleanup code path must: + * i. delete it from the blocked_locks list inside + * this critical section, + * ii. unwind its frame with EAGAIN, + * iii. try and grant blocked entry locks from other + * clients that were otherwise grantable, but were + * blocked to avoid leaving L1 to starve forever. + * iv. unref the object. 
+ */ + list_del_init(&l->client_list); + + if (!list_empty(&l->domain_list)) { + list_del_init(&l->domain_list); + list_add_tail(&l->client_list, &released); + } else { + list_del_init(&l->blocked_locks); + list_add_tail(&l->client_list, &unwind); } + } + pthread_mutex_unlock(&pinode->mutex); + } + } + pthread_mutex_unlock(&ctx->lock); + + if (!list_empty(&unwind)) { + list_for_each_entry_safe(l, tmp, &unwind, client_list) + { + list_del_init(&l->client_list); + + if (l->frame) + STACK_UNWIND_STRICT(entrylk, l->frame, -1, EAGAIN, NULL); + list_add_tail(&l->client_list, &released); + } + } + + if (!list_empty(&released)) { + list_for_each_entry_safe(l, tmp, &released, client_list) + { + list_del_init(&l->client_list); + + pinode = l->pinode; + dom = get_domain(pinode, l->volume); + + grant_blocked_entry_locks(this, pinode, dom, &now, pcontend); + + pthread_mutex_lock(&pinode->mutex); + { + __pl_entrylk_unref(l); + } + pthread_mutex_unlock(&pinode->mutex); + + inode_unref(pinode->inode); } + } + + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } - return count; + return 0; } int32_t -get_entrylk_count (xlator_t *this, inode_t *inode) +__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + int32_t count = 0; + pl_entry_lock_t *lock = NULL; + pl_dom_list_t *dom = NULL; - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + list_for_each_entry(lock, &dom->entrylk_list, domain_list) { count++; } - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; - - pthread_mutex_lock (&pl_inode->mutex); + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) { - count = __get_entrylk_count (this, pl_inode); + count++; } - pthread_mutex_unlock (&pl_inode->mutex); + } + + return count; +} + +int32_t +get_entrylk_count(xlator_t *this, inode_t *inode) +{ + 
pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int ret = 0; + int32_t count = 0; + + ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } + + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_entrylk_count(this, pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); out: - return count; + return count; } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 508523e1106..d4e51d6e0a1 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -7,819 +7,1168 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/upcall-utils.h> #include "locks.h" +#include "clear.h" #include "common.h" -inline void -__delete_inode_lock (pl_inode_lock_t *lock) +void +__delete_inode_lock(pl_inode_lock_t *lock) { - list_del (&lock->list); + list_del_init(&lock->list); } -static inline void -__pl_inodelk_ref (pl_inode_lock_t *lock) +static void +__pl_inodelk_ref(pl_inode_lock_t *lock) { - lock->ref++; + lock->ref++; } -inline void -__pl_inodelk_unref (pl_inode_lock_t *lock) +void +__pl_inodelk_unref(pl_inode_lock_t *lock) { - lock->ref--; - if (!lock->ref) { - GF_FREE (lock->connection_id); - GF_FREE (lock); - } + lock->ref--; + if (!lock->ref) { + GF_FREE(lock->connection_id); + GF_FREE(lock); + } } -/* Check if 2 inodelks are conflicting on type. 
Only 2 shared locks don't conflict */ -static inline int -inodelk_type_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't + * conflict */ +static int +inodelk_type_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK) - return 1; + if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK) + return 1; - return 0; + return 0; } void -pl_print_inodelk (char *str, int size, int cmd, struct gf_flock *flock, const char *domain) +pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock, + const char *domain) { - char *cmd_str = NULL; - char *type_str = NULL; + char *cmd_str = NULL; + char *type_str = NULL; - switch (cmd) { + switch (cmd) { #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - cmd_str = "GETLK"; - break; + cmd_str = "GETLK"; + break; #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - cmd_str = "SETLK"; - break; + cmd_str = "SETLK"; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - cmd_str = "SETLKW"; - break; + cmd_str = "SETLKW"; + break; default: - cmd_str = "UNKNOWN"; - break; - } + cmd_str = "UNKNOWN"; + break; + } - switch (flock->l_type) { + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "domain: %s, start=%llu, len=%llu, pid=%llu", - cmd_str, type_str, domain, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid); + type_str = "UNKNOWN"; + break; + } + + snprintf(str, size, + "lock=INODELK, cmd=%s, type=%s, " + "domain: %s, start=%llu, len=%llu, pid=%llu", + cmd_str, type_str, domain, (unsigned 
long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid); } /* Determine if the two inodelks overlap reach other's lock regions */ static int -inodelk_overlap (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +inodelk_overlap(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return ((l1->fl_end >= l2->fl_start) && - (l2->fl_end >= l1->fl_start)); + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } /* Returns true if the 2 inodelks have the same owner */ -static inline int -same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +static int +same_inodelk_owner(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->client == l2->client)); + return (is_same_lkowner(&l1->owner, &l2->owner) && + (l1->client == l2->client)); } /* Returns true if the 2 inodelks conflict with each other */ static int -inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +inodelk_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - return (inodelk_overlap (l1, l2) && - inodelk_type_conflict (l1, l2)); + return (inodelk_overlap(l1, l2) && inodelk_type_conflict(l1, l2)); } -/* Determine if lock is grantable or not */ -static pl_inode_lock_t * -__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) +/* + * Check to see if the candidate lock overlaps/conflicts with the + * requested lock. If so, determine how old the lock is and return + * true if it exceeds the configured threshold, false otherwise. 
+ */ +static inline gf_boolean_t +__stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock, + pl_inode_lock_t *requested_lock, time_t *lock_age_sec) { - pl_inode_lock_t *l = NULL; - pl_inode_lock_t *ret = NULL; - if (list_empty (&dom->inodelk_list)) - goto out; - list_for_each_entry (l, &dom->inodelk_list, list){ - if (inodelk_conflict (lock, l) && - !same_inodelk_owner (lock, l)) { - ret = l; - goto out; - } - } -out: - return ret; + posix_locks_private_t *priv = NULL; + + priv = this->private; + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? + */ + if (inodelk_conflict(candidate_lock, requested_lock)) { + *lock_age_sec = gf_time() - candidate_lock->granted_time; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; } -static pl_inode_lock_t * -__blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock) +/* Examine any locks held on this inode and potentially revoke the lock + * if the age exceeds revocation_secs. We will clear _only_ those locks + * which are granted, and then grant those locks which are blocked. + * + * Depending on how this patch works in the wild, we may expand this and + * introduce a heuristic which clears blocked locks as well if they + * are beyond a threshold. 
+ */ +static gf_boolean_t +__inodelk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_inode_lock_t *lock) { - pl_inode_lock_t *l = NULL; - pl_inode_lock_t *ret = NULL; - - if (list_empty (&dom->blocked_inodelks)) - return NULL; + posix_locks_private_t *priv = NULL; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + + args.type = CLRLK_INODE; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty(&dom->inodelk_list)) + goto out; + + pthread_mutex_lock(&pinode->mutex); + list_for_each_entry_safe(lk, tmp, &dom->inodelk_list, list) + { + if (__stale_inodelk(this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } - list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) { - if (inodelk_conflict (lock, l)) { - ret = l; - goto out; - } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe(lk, tmp, &dom->blocked_inodelks, blocked_locks) + { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } } + } + pthread_mutex_unlock(&pinode->mutex); out: - return ret; + if (revoke_lock == _gf_true) { + clrlk_clear_inodelk(this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log(this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Inode lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + return revoke_lock; } -static int -__owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock) +void 
+inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock, + struct timespec *now, struct list_head *contend) { - pl_inode_lock_t *lock = NULL; + posix_locks_private_t *priv; + int64_t elapsed; - list_for_each_entry (lock, &dom->inodelk_list, list) { - if (same_inodelk_owner (lock, newlock)) - return 1; - } + priv = this->private; - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - if (same_inodelk_owner (lock, newlock)) - return 1; - } + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { + return; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return; + } - return 0; -} + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To prevent the lock or the inode from + * being destroyed before we process them, we take an additional + * reference on both. */ + inode_ref(lock->pl_inode->inode); + __pl_inodelk_ref(lock); + lock->contention_time = *now; -/* Determines if lock can be granted and adds the lock. If the lock - * is blocking, adds it to the blocked_inodelks list of the domain. 
- */ -static int -__lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, - int can_block, pl_dom_list_t *dom) + list_add_tail(&lock->contend, contend); +} + +void +inodelk_contention_notify(xlator_t *this, struct list_head *contend) { - pl_inode_lock_t *conf = NULL; - int ret = -EINVAL; + struct gf_upcall up; + struct gf_upcall_inodelk_contention lc; + pl_inode_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_inode_lock_t, contend); + + pl_inode = lock->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock)); + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pl_inode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the inodelk + * was issued by a server side xlator. 
*/ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } - conf = __inodelk_grantable (dom, lock); - if (conf){ - ret = -EAGAIN; - if (can_block == 0) - goto out; + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_INODELK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "sent"); + } + } - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); + pthread_mutex_lock(&pl_inode->mutex); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_del_init(&lock->contend); + __pl_inodelk_unref(lock); + pthread_mutex_unlock(&pl_inode->mutex); - goto out; + inode_unref(pl_inode->inode); + } +} + +/* Determine if lock is grantable or not */ +static pl_inode_lock_t * +__inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + struct timespec *now, struct list_head *contend) +{ + pl_inode_lock_t *l = NULL; + pl_inode_lock_t *ret = NULL; + + list_for_each_entry(l, &dom->inodelk_list, list) + { + if (inodelk_conflict(lock, l) && !same_inodelk_owner(lock, l)) { + if (ret == NULL) { + ret = l; + if (contend == NULL) { + break; + } + } + inodelk_contention_notify_check(this, l, now, contend); } + } - if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (can_block == 0) - goto out; + return ret; +} - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); +static pl_inode_lock_t * +__blocked_lock_conflict(pl_dom_list_t *dom, pl_inode_lock_t *lock) +{ + pl_inode_lock_t *l 
= NULL; - gf_log (this->name, GF_LOG_TRACE, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_for_each_entry(l, &dom->blocked_inodelks, blocked_locks) + { + if (inodelk_conflict(lock, l)) { + return l; + } + } + return NULL; +} - goto out; - } - __pl_inodelk_ref (lock); - gettimeofday (&lock->granted_time, NULL); - list_add (&lock->list, &dom->inodelk_list); +static int +__owner_has_lock(pl_dom_list_t *dom, pl_inode_lock_t *newlock) +{ + pl_inode_lock_t *lock = NULL; - ret = 0; + list_for_each_entry(lock, &dom->inodelk_list, list) + { + if (same_inodelk_owner(lock, newlock)) + return 1; + } + + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + if (same_inodelk_owner(lock, newlock)) + return 1; + } + + return 0; +} +static int +__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + int can_block) +{ + if (can_block == 0) { + goto out; + } + + lock->blkd_time = gf_time(); + list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks); + + gf_msg_trace(this->name, 0, + "%s (pid=%d) (lk-owner=%s) %" PRId64 + " - " + "%" PRId64 " => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + + pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + lock->volume); out: + return -EAGAIN; +} + +/* Determines if lock can be granted and adds the lock. If the lock + * is blocking, adds it to the blocked_inodelks list of the domain. 
+ */ +static int +__lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, + int can_block, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) +{ + pl_inode_lock_t *conf = NULL; + int ret; + + ret = pl_inode_remove_inodelk(pl_inode, lock); + if (ret < 0) { return ret; + } + if (ret == 0) { + conf = __inodelk_grantable(this, dom, lock, now, contend); + } + if ((ret > 0) || (conf != NULL)) { + return __lock_blocked_add(this, dom, lock, can_block); + } + + /* To prevent blocked locks starvation, check if there are any blocked + * locks that may conflict with this lock. If there are, then don't grant + * the lock. BUT grant the lock if the owner already has lock to allow + * nested locks. + * Example: + * SHD from Machine1 takes (gfid, 0-infinity) and is granted. + * SHD from machine2 takes (gfid, 0-infinity) and is blocked. + * When SHD from Machine1 takes (gfid, 0-128KB) it + * needs to be granted, without which the earlier lock on 0-infinity + * will not be unlocked by SHD from Machine1. + * TODO: Find why 'owner_has_lock' is checked even for blocked locks. 
+ */ + if (__blocked_lock_conflict(dom, lock) && !(__owner_has_lock(dom, lock))) { + if (can_block != 0) { + gf_log(this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); + } + + return __lock_blocked_add(this, dom, lock, can_block); + } + __pl_inodelk_ref(lock); + lock->granted_time = gf_time(); + list_add(&lock->list, &dom->inodelk_list); + + return 0; } /* Return true if the two inodelks have exactly same lock boundaries */ static int -inodelks_equal (pl_inode_lock_t *l1, pl_inode_lock_t *l2) +inodelks_equal(pl_inode_lock_t *l1, pl_inode_lock_t *l2) { - if ((l1->fl_start == l2->fl_start) && - (l1->fl_end == l2->fl_end)) - return 1; + if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end)) + return 1; - return 0; + return 0; } - static pl_inode_lock_t * -find_matching_inodelk (pl_inode_lock_t *lock, pl_dom_list_t *dom) +find_matching_inodelk(pl_inode_lock_t *lock, pl_dom_list_t *dom) { - pl_inode_lock_t *l = NULL; - list_for_each_entry (l, &dom->inodelk_list, list) { - if (inodelks_equal (l, lock) && - same_inodelk_owner (l, lock)) - return l; - } - return NULL; + pl_inode_lock_t *l = NULL; + list_for_each_entry(l, &dom->inodelk_list, list) + { + if (inodelks_equal(l, lock) && same_inodelk_owner(l, lock)) + return l; + } + return NULL; } /* Set F_UNLCK removes a lock which has the exact same lock boundaries * as the UNLCK lock specifies. 
If such a lock is not found, returns invalid */ static pl_inode_lock_t * -__inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) +__inode_unlock_lock(xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) { - - pl_inode_lock_t *conf = NULL; - - conf = find_matching_inodelk (lock, dom); - if (!conf) { - gf_log (this->name, GF_LOG_ERROR, - " Matching lock not found for unlock %llu-%llu, by %s " - "on %p", (unsigned long long)lock->fl_start, - (unsigned long long)lock->fl_end, - lkowner_utoa (&lock->owner), lock->client); - goto out; - } - __delete_inode_lock (conf); - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock found for unlock %llu-%llu, by %s on %p", - (unsigned long long)lock->fl_start, - (unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner), - lock->client); + pl_inode_lock_t *conf = NULL; + inode_t *inode = NULL; + + inode = lock->pl_inode->inode; + + conf = find_matching_inodelk(lock, dom); + if (!conf) { + gf_log(this->name, GF_LOG_ERROR, + " Matching lock not found for unlock %llu-%llu, by %s " + "on %p for gfid:%s", + (unsigned long long)lock->fl_start, + (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner), + lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN"); + goto out; + } + __delete_inode_lock(conf); + gf_log(this->name, GF_LOG_DEBUG, + " Matching lock found for unlock %llu-%llu, by %s on %p for gfid:%s", + (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, + lkowner_utoa(&lock->owner), lock->client, + inode ? 
uuid_utoa(inode->gfid) : "UNKNOWN"); out: - return conf; + return conf; } -static void -__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted, pl_dom_list_t *dom) -{ - int bl_ret = 0; - pl_inode_lock_t *bl = NULL; - pl_inode_lock_t *tmp = NULL; - struct list_head blocked_list; +void +__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend) +{ + pl_inode_lock_t *bl = NULL; + pl_inode_lock_t *tmp = NULL; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&dom->blocked_inodelks, &blocked_list); + struct list_head blocked_list; - list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) { + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&dom->blocked_inodelks, &blocked_list); - list_del_init (&bl->blocked_locks); + list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks) + { + list_del_init(&bl->blocked_locks); - bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom); + bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend); - if (bl_ret == 0) { - list_add (&bl->blocked_locks, granted); - } + if (bl->status != -EAGAIN) { + list_add_tail(&bl->blocked_locks, granted); } - return; + } } -/* Grant all inodelks blocked on a lock */ void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom) +unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - struct list_head granted; - pl_inode_lock_t *lock; - pl_inode_lock_t *tmp; + pl_inode_lock_t *lock; + pl_inode_lock_t *tmp; + int32_t op_ret; + int32_t op_errno; + + list_for_each_entry_safe(lock, tmp, granted, blocked_locks) + { + if (lock->status == 0) { + op_ret = 0; + op_errno = 0; + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => Granted", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + } else { + op_ret = -1; + op_errno = -lock->status; + } + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, + op_ret, op_errno, lock->volume); - INIT_LIST_HEAD (&granted); + STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL); + lock->frame = NULL; + } - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(lock, tmp, granted, blocked_locks) { - __grant_blocked_inode_locks (this, pl_inode, &granted, dom); + list_del_init(&lock->blocked_locks); + __pl_inodelk_unref(lock); } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); +} - list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Granted", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); +/* Grant all inodelks blocked on a lock */ +void +grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) +{ + struct list_head granted; - pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW, - &lock->user_flock, 0, 0, lock->volume); + INIT_LIST_HEAD(&granted); - STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL); - } + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now, + contend); + } + pthread_mutex_unlock(&pl_inode->mutex); - pthread_mutex_lock (&pl_inode->mutex); - { - list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { - list_del_init (&lock->blocked_locks); - __pl_inodelk_unref (lock); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + unwind_granted_inodes(this, pl_inode, &granted); } -/* Release all inodelks from this 
client */ -static int -release_inode_locks_of_client (xlator_t *this, pl_dom_list_t *dom, - inode_t *inode, client_t *client) +static void +pl_inodelk_log_cleanup(pl_inode_lock_t *lock) { - pl_inode_lock_t *tmp = NULL; - pl_inode_lock_t *l = NULL; + pl_inode_t *pl_inode = NULL; - pl_inode_t * pinode = NULL; + pl_inode = lock->pl_inode; - struct list_head released; + gf_log(THIS->name, GF_LOG_WARNING, + "releasing lock on %s held by " + "{client=%p, pid=%" PRId64 " lk-owner=%s}", + uuid_utoa(pl_inode->gfid), lock->client, (uint64_t)lock->client_pid, + lkowner_utoa(&lock->owner)); +} - char *path = NULL; - char *file = NULL; +/* Release all inodelks from this client */ +int +pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) +{ + posix_locks_private_t *priv; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *l = NULL; + pl_dom_list_t *dom = NULL; + pl_inode_t *pl_inode = NULL; + struct list_head *pcontend = NULL; + struct list_head released; + struct list_head unwind; + struct list_head contend; + struct timespec now = {}; + + priv = this->private; + + INIT_LIST_HEAD(&released); + INIT_LIST_HEAD(&unwind); + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + pthread_mutex_lock(&ctx->lock); + { + list_for_each_entry_safe(l, tmp, &ctx->inodelk_lockers, client_list) + { + pl_inodelk_log_cleanup(l); + + pl_inode = l->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + { + /* If the inodelk object is part of granted list but not + * blocked list, then perform the following actions: + * i. delete the object from granted list; + * ii. grant other locks (from other clients) that may + * have been blocked on this inodelk; and + * iii. unref the object. 
+ * + * If the inodelk object (L1) is part of both granted + * and blocked lists, then this means that a parallel + * unlock on another inodelk (L2 say) may have 'granted' + * L1 and added it to 'granted' list in + * __grant_blocked_inode_locks() (although using the + * 'blocked_locks' member). In that case, the cleanup + * codepath must try and grant other overlapping + * blocked inodelks from other clients, now that L1 is + * out of their way and then unref L1 in the end, and + * leave it to the other thread (the one executing + * unlock codepath) to unwind L1's frame, delete it from + * blocked_locks list, and perform the last unref on L1. + * + * If the inodelk object (L1) is part of blocked list + * only, the cleanup code path must: + * i. delete it from the blocked_locks list inside + * this critical section, + * ii. unwind its frame with EAGAIN, + * iii. try and grant blocked inode locks from other + * clients that were otherwise grantable, but just + * got blocked to avoid leaving L1 to starve + * forever. + * iv. unref the object. 
+ */ + list_del_init(&l->client_list); + + if (!list_empty(&l->list)) { + __delete_inode_lock(l); + list_add_tail(&l->client_list, &released); + } else { + list_del_init(&l->blocked_locks); + list_add_tail(&l->client_list, &unwind); + } + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } + pthread_mutex_unlock(&ctx->lock); - INIT_LIST_HEAD (&released); + if (!list_empty(&unwind)) { + list_for_each_entry_safe(l, tmp, &unwind, client_list) + { + list_del_init(&l->client_list); - pinode = pl_inode_get (this, inode); + if (l->frame) + STACK_UNWIND_STRICT(inodelk, l->frame, -1, EAGAIN, NULL); + list_add_tail(&l->client_list, &released); + } + } - pthread_mutex_lock (&pinode->mutex); + if (!list_empty(&released)) { + list_for_each_entry_safe(l, tmp, &released, client_list) { + list_del_init(&l->client_list); - list_for_each_entry_safe (l, tmp, &dom->blocked_inodelks, blocked_locks) { - if (l->client != client) - continue; - - list_del_init (&l->blocked_locks); - - inode_path (inode, NULL, &path); - if (path) - file = path; - else - file = uuid_utoa (inode->gfid); - - gf_log (this->name, GF_LOG_DEBUG, - "releasing blocking lock on %s held by " - "{client=%p, pid=%"PRId64" lk-owner=%s}", - file, client, (uint64_t) l->client_pid, - lkowner_utoa (&l->owner)); - - list_add (&l->blocked_locks, &released); - if (path) { - GF_FREE (path); - path = NULL; - } - } + pl_inode = l->pl_inode; - list_for_each_entry_safe (l, tmp, &dom->inodelk_list, list) { - if (l->client != client) - continue; - - inode_path (inode, NULL, &path); - if (path) - file = path; - else - file = uuid_utoa (inode->gfid); - - gf_log (this->name, GF_LOG_DEBUG, - "releasing granted lock on %s held by " - "{client=%p, pid=%"PRId64" lk-owner=%s}", - file, client, (uint64_t) l->client_pid, - lkowner_utoa (&l->owner)); - - if (path) { - GF_FREE (path); - path = NULL; - } - - __delete_inode_lock (l); - __pl_inodelk_unref (l); - } - } - GF_FREE (path); - - pthread_mutex_unlock (&pinode->mutex); + dom = 
get_domain(pl_inode, l->volume); - list_for_each_entry_safe (l, tmp, &released, blocked_locks) { - list_del_init (&l->blocked_locks); + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); - STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, NULL); - //No need to take lock as the locks are only in one list - __pl_inodelk_unref (l); + pthread_mutex_lock(&pl_inode->mutex); + { + __pl_inodelk_unref(l); + } + pthread_mutex_unlock(&pl_inode->mutex); + inode_unref(pl_inode->inode); } + } - grant_blocked_inode_locks (this, pinode, dom); - return 0; -} + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + return 0; +} static int -pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, - int can_block, pl_dom_list_t *dom) +pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, + inode_t *inode) { - int ret = -EINVAL; - pl_inode_lock_t *retlock = NULL; - gf_boolean_t unref = _gf_true; - - pthread_mutex_lock (&pl_inode->mutex); - { - if (lock->fl_type != F_UNLCK) { - ret = __lock_inodelk (this, pl_inode, lock, can_block, dom); - if (ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->fl_start, - lock->fl_end); - } else if (ret == -EAGAIN) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - if (can_block) - unref = _gf_false; - } - } else { - retlock = __inode_unlock_lock (this, lock, dom); - if (!retlock) { - gf_log (this->name, GF_LOG_DEBUG, - "Bad Unlock issued on Inode lock"); - ret = -EINVAL; - goto out; - } - __pl_inodelk_unref (retlock); - - ret = 0; + posix_locks_private_t *priv = NULL; + int ret = -EINVAL; + pl_inode_lock_t *retlock = NULL; + gf_boolean_t unref = _gf_true; + gf_boolean_t need_inode_unref = _gf_false; + struct list_head *pcontend = NULL; + struct list_head contend; + struct list_head wake; + struct timespec now = {}; + short fl_type; + + lock->pl_inode = pl_inode; + fl_type = lock->fl_type; + + priv = this->private; + + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or + * an unsuccessful blocking lock operation, the inode needs to be ref'd. + * + * But doing so might give room to a race where the lock-requesting + * client could send a DISCONNECT just before this thread refs the inode + * after the locking is done, and the epoll thread could unref the inode + * in cleanup which means the inode's refcount would come down to 0, and + * the call to pl_forget() at this point destroys @pl_inode. Now when + * the io-thread executing this function tries to access pl_inode, + * it could crash on account of illegal memory access. + * + * To get around this problem, the inode is ref'd once even before + * adding the lock into client_list as a precautionary measure. + * This way even if there are DISCONNECTs, there will always be 1 extra + * ref on the inode, so @pl_inode is still alive until after the + * current stack unwinds. 
+ */ + pl_inode->inode = inode_ref(inode); + + if (priv->revocation_secs != 0) { + if (lock->fl_type != F_UNLCK) { + __inodelk_prune_stale(this, pl_inode, dom, lock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock()) { + pthread_mutex_lock(&pl_inode->mutex); + { + __pl_inodelk_unref(lock); } + pthread_mutex_unlock(&pl_inode->mutex); + inode_unref(pl_inode->inode); + gf_log(this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + return 0; + } } -out: + } + + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + + INIT_LIST_HEAD(&wake); + + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pl_inode->mutex); + { + if (lock->fl_type != F_UNLCK) { + ret = __lock_inodelk(this, pl_inode, lock, can_block, dom, &now, + pcontend); + if (ret == 0) { + lock->frame = NULL; + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->fl_start, lock->fl_end); + } else if (ret == -EAGAIN) { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " => NOK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + if (can_block) { + unref = _gf_false; + } + } + /* The extra ref taken at the start of this function is kept only + * when the lock was granted (ret == 0) or a blocking request was + * queued (-EAGAIN with can_block set). In every other case it + * must be negated. */ + need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block); + if (ctx && !need_inode_unref) { + list_add_tail(&lock->client_list, &ctx->inodelk_lockers); + } + } else { + /* Irrespective of whether unlock succeeds or not, + * the extra inode ref that was done at the start of + * this function must be negated. 
Towards this, + * @need_inode_unref flag is set unconditionally here. + */ + need_inode_unref = _gf_true; + retlock = __inode_unlock_lock(this, lock, dom); + if (!retlock) { + gf_log(this->name, GF_LOG_DEBUG, + "Bad Unlock issued on Inode lock"); + ret = -EINVAL; + goto out; + } + list_del_init(&retlock->client_list); + __pl_inodelk_unref(retlock); + + pl_inode_remove_unlocked(this, pl_inode, &wake); + + ret = 0; + } + out: if (unref) - __pl_inodelk_unref (lock); - pthread_mutex_unlock (&pl_inode->mutex); - grant_blocked_inode_locks (this, pl_inode, dom); - return ret; + __pl_inodelk_unref(lock); + } + pthread_mutex_unlock(&pl_inode->mutex); + if (ctx) + pthread_mutex_unlock(&ctx->lock); + + pl_inode_remove_wake(&wake); + + /* The following (extra) unref corresponds to the ref that + * was done at the time the lock was granted. + */ + if ((fl_type == F_UNLCK) && (ret == 0)) { + inode_unref(pl_inode->inode); + grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend); + } + + if (need_inode_unref) { + inode_unref(pl_inode->inode); + } + + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + + return ret; } /* Create a new inode_lock_t */ -pl_inode_lock_t * -new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, - call_frame_t *frame, xlator_t *this, const char *volume, - char *conn_id) +static pl_inode_lock_t * +new_inode_lock(struct gf_flock *flock, client_t *client, pid_t client_pid, + call_frame_t *frame, xlator_t *this, const char *volume, + char *conn_id, int32_t *op_errno) { - pl_inode_lock_t *lock = NULL; - - lock = GF_CALLOC (1, sizeof (*lock), - gf_locks_mt_pl_inode_lock_t); - if (!lock) { - return NULL; - } - - lock->fl_start = flock->l_start; - lock->fl_type = flock->l_type; - - if (flock->l_len == 0) - lock->fl_end = LLONG_MAX; - else - lock->fl_end = flock->l_start + flock->l_len - 1; - - lock->client = client; - lock->client_pid = client_pid; - lock->volume = volume; - lock->owner = frame->root->lk_owner; 
- lock->frame = frame; - lock->this = this; - - if (conn_id) { - lock->connection_id = gf_strdup (conn_id); - } - - INIT_LIST_HEAD (&lock->list); - INIT_LIST_HEAD (&lock->blocked_locks); - __pl_inodelk_ref (lock); + pl_inode_lock_t *lock = NULL; + + if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) { + *op_errno = EINVAL; + goto out; + } + + lock = GF_CALLOC(1, sizeof(*lock), gf_locks_mt_pl_inode_lock_t); + if (!lock) { + *op_errno = ENOMEM; + goto out; + } + + lock->fl_start = flock->l_start; + lock->fl_type = flock->l_type; + + if (flock->l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = flock->l_start + flock->l_len - 1; + + lock->client = client; + lock->client_pid = client_pid; + lock->volume = volume; + lock->owner = frame->root->lk_owner; + lock->frame = frame; + lock->this = this; + + if (conn_id) { + lock->connection_id = gf_strdup(conn_id); + } + + INIT_LIST_HEAD(&lock->list); + INIT_LIST_HEAD(&lock->blocked_locks); + INIT_LIST_HEAD(&lock->client_list); + INIT_LIST_HEAD(&lock->contend); + __pl_inodelk_ref(lock); - return lock; +out: + return lock; } int32_t -_pl_convert_volume (const char *volume, char **res) +_pl_convert_volume(const char *volume, char **res) { - char *mdata_vol = NULL; - int ret = 0; - - mdata_vol = strrchr (volume, ':'); - //if the volume already ends with :metadata don't bother - if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0)) - return 0; + char *mdata_vol = NULL; + int ret = 0; - ret = gf_asprintf (res, "%s:metadata", volume); - if (ret <= 0) - return ENOMEM; + mdata_vol = strrchr(volume, ':'); + // if the volume already ends with :metadata don't bother + if (mdata_vol && (strcmp(mdata_vol, ":metadata") == 0)) return 0; + + ret = gf_asprintf(res, "%s:metadata", volume); + if (ret <= 0) + return ENOMEM; + return 0; } int32_t -_pl_convert_volume_for_special_range (struct gf_flock *flock, - const char *volume, char **res) +_pl_convert_volume_for_special_range(struct gf_flock *flock, const 
char *volume, + char **res) { - int32_t ret = 0; + int32_t ret = 0; - if ((flock->l_start == LLONG_MAX -1) && - (flock->l_len == 0)) { - ret = _pl_convert_volume (volume, res); - } + if ((flock->l_start == LLONG_MAX - 1) && (flock->l_len == 0)) { + ret = _pl_convert_volume(volume, res); + } - return ret; + return ret; } /* Common inodelk code called from pl_inodelk and pl_finodelk */ int -pl_common_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, inode_t *inode, int32_t cmd, - struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - GF_UNUSED int dict_ret = -1; - int can_block = 0; - pl_inode_t * pinode = NULL; - pl_inode_lock_t * reqlock = NULL; - pl_dom_list_t * dom = NULL; - char *res = NULL; - char *res1 = NULL; - char *conn_id = NULL; - pl_ctx_t *ctx = NULL; - - if (xdata) - dict_ret = dict_get_str (xdata, "connection-id", &conn_id); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (inode, unwind); - VALIDATE_OR_GOTO (flock, unwind); - - if ((flock->l_start < 0) || (flock->l_len < 0)) { - op_errno = EINVAL; - goto unwind; - } - - op_errno = _pl_convert_volume_for_special_range (flock, volume, &res); - if (op_errno) - goto unwind; - if (res) - volume = res; - - pl_trace_in (this, frame, fd, loc, cmd, flock, volume); - - pinode = pl_inode_get (this, inode); - if (!pinode) { - op_errno = ENOMEM; - goto unwind; +pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + inode_t *inode, int32_t cmd, struct gf_flock *flock, + loc_t *loc, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + GF_UNUSED int dict_ret = -1; + int can_block = 0; + short lock_type = 0; + pl_inode_t *pinode = NULL; + pl_inode_lock_t *reqlock = NULL; + pl_dom_list_t *dom = NULL; + char *res = NULL; + char *res1 = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + + if (xdata) + dict_ret = dict_get_str(xdata, "connection-id", &conn_id); 
+ + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(inode, unwind); + VALIDATE_OR_GOTO(flock, unwind); + + if ((flock->l_start < 0) || (flock->l_len < 0)) { + op_errno = EINVAL; + goto unwind; + } + + op_errno = _pl_convert_volume_for_special_range(flock, volume, &res); + if (op_errno) + goto unwind; + if (res) + volume = res; + + pl_trace_in(this, frame, fd, loc, cmd, flock, volume); + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + op_errno = ENOMEM; + gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; } + } - dom = get_domain (pinode, volume); - if (!dom) { - op_errno = ENOMEM; - goto unwind; - } + pinode = pl_inode_get(this, inode, NULL); + if (!pinode) { + op_errno = ENOMEM; + goto unwind; + } - if (frame->root->lk_owner.len == 0) { - /* - special case: this means release all locks - from this client - */ - gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks from client %p", frame->root->client); - - release_inode_locks_of_client (this, dom, inode, frame->root->client); - _pl_convert_volume (volume, &res1); - if (res1) { - dom = get_domain (pinode, res1); - if (dom) - release_inode_locks_of_client (this, dom, - inode, frame->root->client); - } + dom = get_domain(pinode, volume); + if (!dom) { + op_errno = ENOMEM; + goto unwind; + } - op_ret = 0; - goto unwind; - } + reqlock = new_inode_lock(flock, frame->root->client, frame->root->pid, + frame, this, dom->domain, conn_id, &op_errno); - reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid, - frame, this, volume, conn_id); + if (!reqlock) { + op_ret = -1; + goto unwind; + } - if (!reqlock) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - - switch (cmd) { + switch (cmd) { case F_SETLKW: - can_block = 1; + can_block = 1; - /* fall through */ + /* fall through */ case F_SETLK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - ret = pl_inode_setlk (this, pinode, reqlock, - can_block, dom); - - if 
(ret < 0) { - if ((can_block) && (F_UNLCK != flock->l_type)) { - pl_trace_block (this, frame, fd, loc, - cmd, flock, volume); - goto out; - } - gf_log (this->name, GF_LOG_TRACE, "returning EAGAIN"); - op_errno = -ret; - goto unwind; + lock_type = flock->l_type; + memcpy(&reqlock->user_flock, flock, sizeof(struct gf_flock)); + ret = pl_inode_setlk(this, ctx, pinode, reqlock, can_block, dom, + inode); + + if (ret < 0) { + if (ret == -EAGAIN) { + if (can_block && (F_UNLCK != lock_type)) { + goto out; + } + gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN"); + } else { + gf_log(this->name, GF_LOG_TRACE, "returning %d", ret); } - break; - - default: - op_errno = ENOTSUP; - gf_log (this->name, GF_LOG_DEBUG, - "Lock command F_GETLK not supported for [f]inodelk " - "(cmd=%d)", - cmd); + op_errno = -ret; goto unwind; - } - - op_ret = 0; + } + break; - ctx = pl_ctx_get (frame->root->client, this); - - if (ctx == NULL) { - gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); - goto unwind; - } + default: + op_errno = ENOTSUP; + gf_log(this->name, GF_LOG_DEBUG, + "Lock command F_GETLK not supported for [f]inodelk " + "(cmd=%d)", + cmd); + goto unwind; + } - if (flock->l_type == F_UNLCK) - pl_del_locker (ctx->ltable, volume, loc, fd, - &frame->root->lk_owner, - GF_FOP_INODELK); - else - pl_add_locker (ctx->ltable, volume, loc, fd, - frame->root->pid, - &frame->root->lk_owner, - GF_FOP_INODELK); + op_ret = 0; unwind: - if ((inode != NULL) && (flock !=NULL)) { - pl_update_refkeeper (this, inode); - pl_trace_out (this, frame, fd, loc, cmd, flock, op_ret, op_errno, volume); - } + if (flock != NULL) + pl_trace_out(this, frame, fd, loc, cmd, flock, op_ret, op_errno, + volume); - STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL); + STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, NULL); out: - GF_FREE (res); - GF_FREE (res1); - return 0; + GF_FREE(res); + GF_FREE(res1); + return 0; } int -pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, 
loc_t *loc, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, - loc, NULL, xdata); + pl_common_inodelk(frame, this, volume, loc->inode, cmd, flock, loc, NULL, + xdata); - return 0; + return 0; } int -pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, - NULL, fd, xdata); - - return 0; + pl_common_inodelk(frame, this, volume, fd->inode, cmd, flock, NULL, fd, + xdata); + return 0; } -static inline int32_t -__get_inodelk_dom_count (pl_dom_list_t *dom) +static int32_t +__get_inodelk_dom_count(pl_dom_list_t *dom) { - pl_inode_lock_t *lock = NULL; - int32_t count = 0; - - list_for_each_entry (lock, &dom->inodelk_list, list) { - count++; - } - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - count++; - } - return count; + pl_inode_lock_t *lock = NULL; + int32_t count = 0; + + list_for_each_entry(lock, &dom->inodelk_list, list) { count++; } + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + count++; + } + return count; } /* Returns the no. of locks (blocked/granted) held on a given domain name * If @domname is NULL, returns the no. of locks in all the domains present. 
* If @domname is non-NULL and non-existent, returns 0 */ int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname) +__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname) { - int32_t count = 0; - pl_dom_list_t *dom = NULL; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - if (domname) { - if (strcmp (domname, dom->domain) == 0) { - count = __get_inodelk_dom_count (dom); - goto out; - } - - } else { - /* Counting locks from all domains */ - count += __get_inodelk_dom_count (dom); + int32_t count = 0; + pl_dom_list_t *dom = NULL; + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + if (domname) { + if (strcmp(domname, dom->domain) == 0) { + count = __get_inodelk_dom_count(dom); + goto out; + } - } + } else { + /* Counting locks from all domains */ + count += __get_inodelk_dom_count(dom); } + } out: - return count; + return count; } int32_t -get_inodelk_count (xlator_t *this, inode_t *inode, char *domname) +get_inodelk_count(xlator_t *this, inode_t *inode, char *domname) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int ret = 0; + int32_t count = 0; - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } + ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - pthread_mutex_lock (&pl_inode->mutex); - { - count = __get_inodelk_count (this, pl_inode, domname); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_inodelk_count(this, pl_inode, domname); + } + pthread_mutex_unlock(&pl_inode->mutex); out: - return count; + return count; } diff --git a/xlators/features/locks/src/locks-mem-types.h b/xlators/features/locks/src/locks-mem-types.h index 08aeb0a7925..a76605027b3 100644 --- 
a/xlators/features/locks/src/locks-mem-types.h +++ b/xlators/features/locks/src/locks-mem-types.h @@ -11,19 +11,18 @@ #ifndef __LOCKS_MEM_TYPES_H__ #define __LOCKS_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_locks_mem_types_ { - gf_locks_mt_pl_dom_list_t = gf_common_mt_end + 1, - gf_locks_mt_pl_inode_t, - gf_locks_mt_posix_lock_t, - gf_locks_mt_pl_entry_lock_t, - gf_locks_mt_pl_inode_lock_t, - gf_locks_mt_truncate_ops, - gf_locks_mt_pl_rw_req_t, - gf_locks_mt_posix_locks_private_t, - gf_locks_mt_pl_fdctx_t, - gf_locks_mt_end + gf_locks_mt_pl_dom_list_t = gf_common_mt_end + 1, + gf_locks_mt_pl_inode_t, + gf_locks_mt_posix_lock_t, + gf_locks_mt_pl_entry_lock_t, + gf_locks_mt_pl_inode_lock_t, + gf_locks_mt_pl_rw_req_t, + gf_locks_mt_posix_locks_private_t, + gf_locks_mt_pl_fdctx_t, + gf_locks_mt_pl_meta_lock_t, + gf_locks_mt_end }; #endif - diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 76fc941d74c..c868eb494a2 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
This file is licensed to you under your choice of the GNU Lesser @@ -10,183 +10,283 @@ #ifndef __POSIX_LOCKS_H__ #define __POSIX_LOCKS_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "compat-errno.h" -#include "stack.h" -#include "call-stub.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/stack.h> +#include <glusterfs/call-stub.h> #include "locks-mem-types.h" -#include "client_t.h" +#include <glusterfs/client_t.h> + +#include <glusterfs/lkowner.h> -#include "lkowner.h" +typedef enum { + MLK_NONE, + MLK_FILE_BASED, + MLK_FORCED, + MLK_OPTIMAL +} mlk_mode_t; /* defines different mandatory locking modes*/ struct __pl_fd; struct __posix_lock { - struct list_head list; + struct list_head list; + + off_t fl_start; + off_t fl_end; + uint32_t lk_flags; + + short fl_type; + short blocked; /* waiting to acquire */ + struct gf_flock user_flock; /* the flock supplied by the user */ + xlator_t *this; /* required for blocked locks */ + unsigned long fd_num; + + fd_t *fd; + call_frame_t *frame; - short fl_type; - off_t fl_start; - off_t fl_end; + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - short blocked; /* waiting to acquire */ - struct gf_flock user_flock; /* the flock supplied by the user */ - xlator_t *this; /* required for blocked locks */ - unsigned long fd_num; + /* These two together serve to uniquely identify each process + across nodes */ - fd_t *fd; - call_frame_t *frame; + void *client; /* to identify client node */ - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + /* This field uniquely identifies the client the lock belongs to. As + * lock migration is handled by rebalance, the client_t object will be + * overwritten by rebalance and can't be deemed as the owner of the + * lock on destination. 
Hence, the below field is migrated from + * source to destination by lock_migration_info_t and updated on the + * destination. So that on client-server disconnection, server can + * cleanup the locks proper;y. */ - /* These two together serve to uniquely identify each process - across nodes */ + char *client_uid; + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ - void *client; /* to identify client node */ - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + int blocking; }; typedef struct __posix_lock posix_lock_t; struct __pl_inode_lock { - struct list_head list; - struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */ - int ref; + struct list_head list; + struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */ + struct list_head contend; /* list of contending locks */ + int ref; - short fl_type; - off_t fl_start; - off_t fl_end; + off_t fl_start; + off_t fl_end; - const char *volume; + const char *volume; - struct gf_flock user_flock; /* the flock supplied by the user */ - xlator_t *this; /* required for blocked locks */ - fd_t *fd; + struct gf_flock user_flock; /* the flock supplied by the user */ + xlator_t *this; /* required for blocked locks */ + struct __pl_inode *pl_inode; - call_frame_t *frame; + call_frame_t *frame; - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - /* These two together serve to uniquely identify each process - across nodes */ + /*last time at which lock contention was detected and notified*/ + struct timespec contention_time; - void *client; /* to identify client node */ - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + /* These two together serve to uniquely identify each process 
+ across nodes */ - char *connection_id; /* stores the client connection id */ + void *client; /* to identify client node */ + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ + + char *connection_id; /* stores the client connection id */ + + struct list_head client_list; /* list of all locks from a client */ + short fl_type; + + int32_t status; /* Error code when we try to grant a lock in blocked + state */ }; typedef struct __pl_inode_lock pl_inode_lock_t; -struct __pl_rw_req_t { - struct list_head list; - call_stub_t *stub; - posix_lock_t region; +struct _pl_rw_req { + struct list_head list; + call_stub_t *stub; + posix_lock_t region; }; -typedef struct __pl_rw_req_t pl_rw_req_t; - -struct __pl_dom_list_t { - struct list_head inode_list; /* list_head back to pl_inode_t */ - const char *domain; - struct list_head entrylk_list; /* List of entry locks */ - struct list_head blocked_entrylks; /* List of all blocked entrylks */ - struct list_head inodelk_list; /* List of inode locks */ - struct list_head blocked_inodelks; /* List of all blocked inodelks */ +typedef struct _pl_rw_req pl_rw_req_t; + +struct _pl_dom_list { + struct list_head inode_list; /* list_head back to pl_inode_t */ + const char *domain; + struct list_head entrylk_list; /* List of entry locks */ + struct list_head blocked_entrylks; /* List of all blocked entrylks */ + struct list_head inodelk_list; /* List of inode locks */ + struct list_head blocked_inodelks; /* List of all blocked inodelks */ }; -typedef struct __pl_dom_list_t pl_dom_list_t; +typedef struct _pl_dom_list pl_dom_list_t; struct __entry_lock { - struct list_head domain_list; /* list_head back to pl_dom_list_t */ - struct list_head blocked_locks; /* list_head back to blocked_entrylks */ + struct list_head domain_list; /* list_head back to pl_dom_list_t */ + struct list_head blocked_locks; /* list_head back to blocked_entrylks */ + struct list_head contend; /* list of contending locks */ + int ref; + + call_frame_t 
*frame; + xlator_t *this; + struct __pl_inode *pinode; - call_frame_t *frame; - xlator_t *this; + const char *volume; - const char *volume; + const char *basename; - const char *basename; - entrylk_type type; + time_t blkd_time; /* time at which lock was queued into blkd list */ + time_t granted_time; /* time at which lock was queued into active list */ - struct timeval blkd_time; /*time at which lock was queued into blkd list*/ - struct timeval granted_time; /*time at which lock was queued into active list*/ + /*last time at which lock contention was detected and notified*/ + struct timespec contention_time; - void *trans; - gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + void *client; + gf_lkowner_t owner; + pid_t client_pid; /* pid of client process */ - char *connection_id; /* stores the client connection id */ + char *connection_id; /* stores the client connection id */ + + struct list_head client_list; /* list of all locks from a client */ + entrylk_type type; }; typedef struct __entry_lock pl_entry_lock_t; - /* The "simulated" inode. 
This contains a list of all the locks associated with this file */ struct __pl_inode { - pthread_mutex_t mutex; - - struct list_head dom_list; /* list of domains */ - struct list_head ext_list; /* list of fcntl locks */ - struct list_head rw_list; /* list of waiting r/w requests */ - struct list_head reservelk_list; /* list of reservelks */ - struct list_head blocked_reservelks; /* list of blocked reservelks */ - struct list_head blocked_calls; /* List of blocked lock calls while a reserve is held*/ - int mandatory; /* if mandatory locking is enabled */ - - inode_t *refkeeper; /* hold refs on an inode while locks are - held to prevent pruning */ + pthread_mutex_t mutex; + + struct list_head dom_list; /* list of domains */ + struct list_head ext_list; /* list of fcntl locks */ + struct list_head rw_list; /* list of waiting r/w requests */ + struct list_head reservelk_list; /* list of reservelks */ + struct list_head blocked_reservelks; /* list of blocked reservelks */ + struct list_head blocked_calls; /* List of blocked lock calls while a + reserve is held*/ + struct list_head metalk_list; /* Meta lock list */ + struct list_head queued_locks; /* This is to store the incoming lock + requests while meta lock is enabled */ + struct list_head waiting; /* List of pending fops waiting to unlink/rmdir + the inode. */ + int mandatory; /* if mandatory locking is enabled */ + + inode_t *refkeeper; /* hold refs on an inode while locks are + held to prevent pruning */ + uuid_t gfid; /* placeholder for gfid of the inode */ + inode_t *inode; /* pointer to be used for ref and unref + of inode_t as long as there are + locks on it */ + gf_boolean_t migrated; + + /* Flag to indicate whether to read mlock-enforce xattr from disk */ + gf_boolean_t check_mlock_info; + + /* Mandatory_lock enforce: IO will be allowed if and only if the lkowner has + held the lock. + + Note: An xattr is set on the file to recover this information post + reboot. 
If client does not want mandatory lock to be enforced, then it + should remove this xattr explicitly + */ + gf_boolean_t mlock_enforced; + /* There are scenarios where mandatory lock is granted but there are IOs + pending at posix level. To avoid this before preempting the previous lock + owner, we wait for all the fops to be unwound. + */ + int fop_wind_count; + pthread_cond_t check_fop_wind_count; + + gf_boolean_t track_fop_wind_count; + + int32_t links; /* Number of hard links the inode has. */ + uint32_t remove_running; /* Number of remove operations running. */ + gf_boolean_t is_locked; /* Regular locks will be blocked. */ + gf_boolean_t removed; /* The inode has been deleted. */ }; typedef struct __pl_inode pl_inode_t; +struct __pl_metalk { + pthread_mutex_t mutex; + /* For pl_inode meta lock list */ + struct list_head list; + /* For pl_ctx_t list */ + struct list_head client_list; + char *client_uid; -struct __pl_fd { - gf_boolean_t nonblocking; /* whether O_NONBLOCK has been set */ + pl_inode_t *pl_inode; + int ref; }; -typedef struct __pl_fd pl_fd_t; - +typedef struct __pl_metalk pl_meta_lock_t; typedef struct { - gf_boolean_t mandatory; /* if mandatory locking is enabled */ - gf_boolean_t trace; /* trace lock requests in and out */ - char *brickname; + char *brickname; + uint32_t revocation_secs; + uint32_t revocation_max_blocked; + uint32_t notify_contention_delay; + mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */ + gf_boolean_t trace; /* trace lock requests in and out */ + gf_boolean_t monkey_unlocking; + gf_boolean_t revocation_clear_all; + gf_boolean_t notify_contention; + gf_boolean_t mlock_enforced; } posix_locks_private_t; - typedef struct { - gf_boolean_t entrylk_count_req; - gf_boolean_t inodelk_count_req; - gf_boolean_t inodelk_dom_count_req; - gf_boolean_t posixlk_count_req; - gf_boolean_t parent_entrylk_req; - - /* used by {f,}truncate */ - loc_t loc; - fd_t *fd; - off_t offset; - dict_t *xdata; - enum {TRUNCATE, 
FTRUNCATE} op; + data_t *inodelk_dom_count_req; + + dict_t *xdata; + loc_t loc[2]; + fd_t *fd; + inode_t *inode; + off_t offset; + glusterfs_fop_t op; + gf_boolean_t entrylk_count_req; + gf_boolean_t inodelk_count_req; + gf_boolean_t posixlk_count_req; + gf_boolean_t parent_entrylk_req; + gf_boolean_t multiple_dom_lk_requests; + int update_mlock_enforced_flag; } pl_local_t; - typedef struct { - struct list_head locks_list; + struct list_head locks_list; } pl_fdctx_t; +struct _locker { + struct list_head lockers; + char *volume; + inode_t *inode; + gf_lkowner_t owner; +}; typedef struct _locks_ctx { - gf_lock_t ltable_lock; /* only for replace, - ltable has its own internal - lock for operations */ - struct _lock_table *ltable; + pthread_mutex_t lock; + struct list_head inodelk_lockers; + struct list_head entrylk_lockers; + struct list_head metalk_list; } pl_ctx_t; +typedef struct _multi_dom_lk_data { + xlator_t *this; + inode_t *inode; + dict_t *xdata_rsp; + gf_boolean_t keep_max; +} multi_dom_lk_data; + +typedef enum { DECREMENT, INCREMENT } pl_count_op_t; pl_ctx_t * -pl_ctx_get (client_t *client, xlator_t *xlator); +pl_ctx_get(client_t *client, xlator_t *xlator); + +int +pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx); + +int +pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx); #endif /* __POSIX_LOCKS_H__ */ diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h new file mode 100644 index 00000000000..e2d3d7ca974 --- /dev/null +++ b/xlators/features/locks/src/pl-messages.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _PL_MESSAGES_H_ +#define _PL_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(PL, PL_MSG_LOCK_NUMBER, PL_MSG_INODELK_CONTENTION_FAILED, + PL_MSG_ENTRYLK_CONTENTION_FAILED); + +#endif /* !_PL_MESSAGES_H_ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 7bfb38a51ac..cf0ae4c57dd 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2012, 2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -12,2761 +12,5084 @@ #include <limits.h> #include <pthread.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" +#include <glusterfs/compat.h> +#include <glusterfs/logging.h> #include "locks.h" #include "common.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include "clear.h" -#include "defaults.h" -#include "syncop.h" +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> #ifndef LLONG_MAX #define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */ -#endif /* LLONG_MAX */ +#endif /* LLONG_MAX */ /* Forward declarations */ +void +do_blocked_rw(pl_inode_t *); +static int +__rw_allowable(pl_inode_t *, posix_lock_t *, glusterfs_fop_t); +static int +format_brickname(char *); +int +pl_lockinfo_get_brickname(xlator_t *, inode_t *, int32_t *); +static int 
+fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **); -void do_blocked_rw (pl_inode_t *); -static int __rw_allowable (pl_inode_t *, posix_lock_t *, glusterfs_fop_t); -static int format_brickname(char *); -int pl_lockinfo_get_brickname (xlator_t *, inode_t *, int32_t *); -static int fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **); +/* + * The client is always requesting data, but older + * servers were not returning it. Newer ones are, so + * the client is receiving a mix of NULL and non-NULL + * xdata in the answers when bricks are of different + * versions. This triggers a bug in older clients. + * To prevent that, we avoid returning extra xdata to + * older clients (making the newer brick to behave as + * an old brick). + */ +#define PL_STACK_UNWIND_FOR_CLIENT(fop, xdata, frame, op_ret, params...) \ + do { \ + pl_local_t *__local = NULL; \ + if (frame->root->client && \ + (frame->root->client->opversion < GD_OP_VERSION_3_10_0)) { \ + __local = frame->local; \ + PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params); \ + } else { \ + PL_STACK_UNWIND(fop, xdata, frame, op_ret, params); \ + } \ + } while (0) + +#define PL_STACK_UNWIND(fop, xdata, frame, op_ret, params...) 
\ + do { \ + pl_local_t *__local = NULL; \ + inode_t *__parent = NULL; \ + inode_t *__inode = NULL; \ + char *__name = NULL; \ + dict_t *__unref = NULL; \ + int __i = 0; \ + __local = frame->local; \ + if (op_ret >= 0 && pl_needs_xdata_response(frame->local)) { \ + if (xdata) \ + dict_ref(xdata); \ + else \ + xdata = dict_new(); \ + if (xdata) { \ + __unref = xdata; \ + while (__local->fd || __local->loc[__i].inode) { \ + pl_get_xdata_rsp_args(__local, #fop, &__parent, &__inode, \ + &__name, __i); \ + pl_set_xdata_response(frame->this, __local, __parent, \ + __inode, __name, xdata, __i > 0); \ + if (__local->fd || __i == 1) \ + break; \ + __i++; \ + } \ + } \ + } \ + PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params); \ + if (__unref) \ + dict_unref(__unref); \ + } while (0) + +#define PL_LOCAL_GET_REQUESTS(frame, this, xdata, __fd, __loc, __newloc) \ + do { \ + if (pl_has_xdata_requests(xdata)) { \ + if (!frame->local) \ + frame->local = mem_get0(this->local_pool); \ + pl_local_t *__local = frame->local; \ + if (__local) { \ + if (__fd) { \ + __local->fd = fd_ref(__fd); \ + __local->inode = inode_ref(__fd->inode); \ + } else { \ + if (__loc) \ + loc_copy(&__local->loc[0], __loc); \ + if (__newloc) \ + loc_copy(&__local->loc[1], __newloc); \ + __local->inode = inode_ref(__local->loc[0].inode); \ + } \ + pl_get_xdata_requests(__local, xdata); \ + } \ + } \ + } while (0) + +#define PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, fd, priv) \ + do { \ + if ((dict && (dict_get(dict, GF_ENFORCE_MANDATORY_LOCK))) || \ + (name && (strcmp(name, GF_ENFORCE_MANDATORY_LOCK) == 0))) { \ + inode_t *__inode = (loc ? 
loc->inode : fd->inode); \ + pl_inode_t *__pl_inode = pl_inode_get(this, __inode, NULL); \ + if (__pl_inode == NULL) { \ + op_ret = -1; \ + op_errno = ENOMEM; \ + goto unwind; \ + } \ + if (!pl_is_mandatory_locking_enabled(__pl_inode) || \ + !priv->mlock_enforced) { \ + op_ret = -1; \ + gf_msg(this->name, GF_LOG_DEBUG, EINVAL, 0, \ + "option %s would need mandatory lock to be enabled " \ + "and feature.enforce-mandatory-lock option to be set " \ + "to on", \ + GF_ENFORCE_MANDATORY_LOCK); \ + op_errno = EINVAL; \ + goto unwind; \ + } \ + \ + op_ret = pl_local_init(frame, this, loc, fd); \ + if (op_ret) { \ + op_errno = ENOMEM; \ + goto unwind; \ + } \ + \ + ((pl_local_t *)(frame->local))->update_mlock_enforced_flag = 1; \ + } \ + } while (0) + +#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk, \ + _args...) \ + ({ \ + struct list_head contend; \ + pl_inode_t *__pl_inode; \ + call_stub_t *__stub; \ + int32_t __error; \ + INIT_LIST_HEAD(&contend); \ + __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? 
_loc2 : _loc1, \ + &__pl_inode, &contend); \ + if (__error < 0) { \ + __stub = fop_##_fop##_stub(_frame, _cont, ##_args); \ + __error = pl_inode_remove_complete(_xl, __pl_inode, __stub, \ + &contend); \ + } else if (__error == 0) { \ + PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1, \ + _loc2); \ + STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl), \ + FIRST_CHILD(_xl)->fops->_fop, ##_args); \ + } \ + __error; \ + }) + +gf_boolean_t +pl_has_xdata_requests(dict_t *xdata) +{ + static char *reqs[] = {GLUSTERFS_ENTRYLK_COUNT, + GLUSTERFS_INODELK_COUNT, + GLUSTERFS_INODELK_DOM_COUNT, + GLUSTERFS_POSIXLK_COUNT, + GLUSTERFS_PARENT_ENTRYLK, + GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, + NULL}; + static int reqs_size[] = {SLEN(GLUSTERFS_ENTRYLK_COUNT), + SLEN(GLUSTERFS_INODELK_COUNT), + SLEN(GLUSTERFS_INODELK_DOM_COUNT), + SLEN(GLUSTERFS_POSIXLK_COUNT), + SLEN(GLUSTERFS_PARENT_ENTRYLK), + SLEN(GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS), + 0}; + int i = 0; + + if (!xdata) + return _gf_false; + + for (i = 0; reqs[i]; i++) + if (dict_getn(xdata, reqs[i], reqs_size[i])) + return _gf_true; + + return _gf_false; +} -static pl_fdctx_t * -pl_new_fdctx () +static int +dict_delete_domain_key(dict_t *dict, char *key, data_t *value, void *data) { - pl_fdctx_t *fdctx = NULL; + dict_del(dict, key); + return 0; +} - fdctx = GF_CALLOC (1, sizeof (*fdctx), - gf_locks_mt_pl_fdctx_t); - GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out); +void +pl_get_xdata_requests(pl_local_t *local, dict_t *xdata) +{ + if (!local || !xdata) + return; - INIT_LIST_HEAD (&fdctx->locks_list); + GF_ASSERT(local->xdata == NULL); + local->xdata = dict_copy_with_ref(xdata, NULL); + + if (dict_get_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT)) { + local->entrylk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT); + } + if (dict_get_sizen(xdata, GLUSTERFS_INODELK_COUNT)) { + local->inodelk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_INODELK_COUNT); + } + if (dict_get_sizen(xdata, 
GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS)) { + local->multiple_dom_lk_requests = 1; + dict_del_sizen(xdata, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS); + dict_foreach_fnmatch(xdata, GLUSTERFS_INODELK_DOM_PREFIX "*", + dict_delete_domain_key, NULL); + } + + local->inodelk_dom_count_req = dict_get_sizen(xdata, + GLUSTERFS_INODELK_DOM_COUNT); + if (local->inodelk_dom_count_req) { + data_ref(local->inodelk_dom_count_req); + dict_del_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT); + } + + if (dict_get_sizen(xdata, GLUSTERFS_POSIXLK_COUNT)) { + local->posixlk_count_req = 1; + dict_del_sizen(xdata, GLUSTERFS_POSIXLK_COUNT); + } + + if (dict_get_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK)) { + local->parent_entrylk_req = 1; + dict_del_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK); + } +} -out: - return fdctx; +gf_boolean_t +pl_needs_xdata_response(pl_local_t *local) +{ + if (!local) + return _gf_false; + + if (local->parent_entrylk_req || local->entrylk_count_req || + local->inodelk_dom_count_req || local->inodelk_count_req || + local->posixlk_count_req || local->multiple_dom_lk_requests) + return _gf_true; + + return _gf_false; } -static pl_fdctx_t * -pl_check_n_create_fdctx (xlator_t *this, fd_t *fd) +void +pl_get_xdata_rsp_args(pl_local_t *local, char *fop, inode_t **parent, + inode_t **inode, char **name, int i) { - int ret = 0; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + if (strcmp(fop, "lookup") == 0) { + *parent = local->loc[0].parent; + *inode = local->loc[0].inode; + *name = (char *)local->loc[0].name; + } else { + if (local->fd) { + *inode = local->fd->inode; + } else { + *inode = local->loc[i].parent; + } + } +} - GF_VALIDATE_OR_GOTO ("posix-locks", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); +static inline int +pl_track_io_fop_count(pl_local_t *local, xlator_t *this, pl_count_op_t op) +{ + pl_inode_t *pl_inode = NULL; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &tmp); - if ((ret != 0) || (tmp == 0)) { - fdctx = pl_new_fdctx (); - if (fdctx == NULL) { - 
goto unlock; - } - } + if (!local) + return -1; + + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) + return -1; - ret = __fd_ctx_set (fd, this, (uint64_t)(long)fdctx); - if (ret != 0) { - GF_FREE (fdctx); - fdctx = NULL; - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx"); + if (pl_inode->mlock_enforced && pl_inode->track_fop_wind_count) { + pthread_mutex_lock(&pl_inode->mutex); + { + if (op == DECREMENT) { + pl_inode->fop_wind_count--; + /* fop_wind_count can go negative when lock enforcement is + * enabled on unwind path of an IO. Hence the "<=" comparison. + */ + if (pl_inode->fop_wind_count <= 0) { + pthread_cond_broadcast(&pl_inode->check_fop_wind_count); + pl_inode->track_fop_wind_count = _gf_false; + pl_inode->fop_wind_count = 0; } + } else { + pl_inode->fop_wind_count++; + } } -unlock: - UNLOCK (&fd->lock); + pthread_mutex_unlock(&pl_inode->mutex); + } -out: - return fdctx; + return 0; } -int -pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +static int32_t +__get_posixlk_count(pl_inode_t *pl_inode) { - pl_local_t *local = NULL; + posix_lock_t *lock = NULL; + int32_t count = 0; - local = frame->local; + list_for_each_entry(lock, &pl_inode->ext_list, list) { count++; } - if (local->op == TRUNCATE) - loc_wipe (&local->loc); + return count; +} - if (local->xdata) - dict_unref (local->xdata); - if (local->fd) - fd_unref (local->fd); +int32_t +get_posixlk_count(xlator_t *this, inode_t *inode) +{ + pl_inode_t *pl_inode = NULL; + uint64_t tmp_pl_inode = 0; + int32_t count = 0; - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - prebuf, postbuf, xdata); - return 0; + int ret = inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret != 0) { + goto out; + } + + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + { + count = __get_posixlk_count(pl_inode); + } + 
pthread_mutex_unlock(&pl_inode->mutex); + +out: + return count; } +void +pl_parent_entrylk_xattr_fill(xlator_t *this, inode_t *parent, char *basename, + dict_t *dict, gf_boolean_t keep_max) +{ + int32_t entrylk = 0; + int32_t maxcount = -1; + int ret = -1; + + if (!parent || !basename) + goto out; + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_PARENT_ENTRYLK); + } + entrylk = check_entrylk_on_basename(this, parent, basename); + if (maxcount >= entrylk) + return; +out: + ret = dict_set_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, entrylk); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_PARENT_ENTRYLK); + } +} -static int -truncate_allowed (pl_inode_t *pl_inode, - client_t *client, pid_t client_pid, - gf_lkowner_t *owner, off_t offset) +void +pl_entrylk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_ENTRYLK_COUNT); + } + count = get_entrylk_count(this, inode); + if (maxcount >= count) + return; + + ret = dict_set_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_ENTRYLK_COUNT); + } +} + +void +pl_inodelk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + char *domname, gf_boolean_t keep_max) { - posix_lock_t *l = NULL; - posix_lock_t region = {.list = {0, }, }; - int ret = 1; + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key 
%s", + GLUSTERFS_INODELK_COUNT); + } + count = get_inodelk_count(this, inode, domname); + if (maxcount >= count) + return; - region.fl_start = offset; - region.fl_end = LLONG_MAX; - region.client = client; - region.client_pid = client_pid; - region.owner = *owner; + ret = dict_set_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to set count for " + "key %s", + GLUSTERFS_INODELK_COUNT); + } - pthread_mutex_lock (&pl_inode->mutex); - { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (!l->blocked - && locks_overlap (&region, l) - && !same_owner (&region, l)) { - ret = 0; - gf_log ("posix-locks", GF_LOG_TRACE, "Truncate " - "allowed"); - break; - } - } - } - pthread_mutex_unlock (&pl_inode->mutex); + return; +} - return ret; +void +pl_posixlk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_POSIXLK_COUNT); + } + count = get_posixlk_count(this, inode); + if (maxcount >= count) + return; + + ret = dict_set_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, " dict_set failed on key %s", + GLUSTERFS_POSIXLK_COUNT); + } } +void +pl_inodelk_xattr_fill_each(xlator_t *this, inode_t *inode, dict_t *dict, + char *domname, gf_boolean_t keep_max, char *key) +{ + int32_t count = 0; + int32_t maxcount = -1; + int ret = -1; + + if (keep_max) { + ret = dict_get_int32(dict, key, &maxcount); + if (ret < 0) + gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s", + GLUSTERFS_INODELK_COUNT); + } + count = get_inodelk_count(this, inode, domname); + if (maxcount >= count) + return; + + ret = dict_set_int32(dict, key, count); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "Failed to 
set count for " + "key %s", + key); + } + + return; +} static int -truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value, + void *data) { - posix_locks_private_t *priv = NULL; - pl_local_t *local = NULL; - inode_t *inode = NULL; - pl_inode_t *pl_inode = NULL; + multi_dom_lk_data *d = data; + char *tmp_key = NULL; + char *save_ptr = NULL; + + tmp_key = gf_strdup(key); + if (!tmp_key) + return -1; + + strtok_r(tmp_key, ":", &save_ptr); + if (!*save_ptr) { + if (tmp_key) + GF_FREE(tmp_key); + gf_msg(THIS->name, GF_LOG_ERROR, 0, EINVAL, + "Could not tokenize domain string from key %s", key); + return -1; + } + + pl_inodelk_xattr_fill_each(d->this, d->inode, d->xdata_rsp, save_ptr, + d->keep_max, key); + if (tmp_key) + GF_FREE(tmp_key); + + return 0; +} +void +pl_fill_multiple_dom_lk_requests(xlator_t *this, pl_local_t *local, + inode_t *inode, dict_t *dict, + gf_boolean_t keep_max) +{ + multi_dom_lk_data data; - priv = this->private; - local = frame->local; + data.this = this; + data.inode = inode; + data.xdata_rsp = dict; + data.keep_max = keep_max; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "got error (errno=%d, stderror=%s) from child", - op_errno, strerror (op_errno)); - goto unwind; - } + dict_foreach_fnmatch(local->xdata, GLUSTERFS_INODELK_DOM_PREFIX "*", + pl_inodelk_xattr_fill_multiple, &data); +} - if (local->op == TRUNCATE) - inode = local->loc.inode; - else - inode = local->fd->inode; +void +pl_set_xdata_response(xlator_t *this, pl_local_t *local, inode_t *parent, + inode_t *inode, char *name, dict_t *xdata, + gf_boolean_t max_lock) +{ + if (!xdata || !local) + return; - pl_inode = pl_inode_get (this, inode); - if (!pl_inode) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + if (local->parent_entrylk_req && parent && name && name[0] != '\0') + pl_parent_entrylk_xattr_fill(this, 
parent, name, xdata, max_lock); - if (priv->mandatory - && pl_inode->mandatory - && !truncate_allowed (pl_inode, frame->root->client, - frame->root->pid, &frame->root->lk_owner, - local->offset)) { - op_ret = -1; - op_errno = EAGAIN; - goto unwind; - } + if (!inode) + return; - switch (local->op) { - case TRUNCATE: - STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - &local->loc, local->offset, local->xdata); - break; - case FTRUNCATE: - STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, - local->fd, local->offset, local->xdata); - break; - } + if (local->entrylk_count_req) + pl_entrylk_xattr_fill(this, inode, xdata, max_lock); - return 0; + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill(this, inode, xdata, + data_to_str(local->inodelk_dom_count_req), + max_lock); -unwind: - gf_log (this->name, GF_LOG_ERROR, "truncate failed with ret: %d, " - "error: %s", op_ret, strerror (op_errno)); - if (local->op == TRUNCATE) - loc_wipe (&local->loc); - if (local->xdata) - dict_unref (local->xdata); - if (local->fd) - fd_unref (local->fd); - - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, NULL, xdata); - return 0; -} + if (local->inodelk_count_req) + pl_inodelk_xattr_fill(this, inode, xdata, NULL, max_lock); + if (local->posixlk_count_req) + pl_posixlk_xattr_fill(this, inode, xdata, max_lock); + + if (local->multiple_dom_lk_requests) + pl_fill_multiple_dom_lk_requests(this, local, inode, xdata, max_lock); +} +/* Checks whether the region where fop is acting upon conflicts + * with existing locks. If there is no conflict function returns + * 1 else returns 0 with can_block boolean set accordingly to + * indicate block/fail the fop. 
+ */ int -pl_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata) +pl_is_fop_allowed(pl_inode_t *pl_inode, posix_lock_t *region, fd_t *fd, + glusterfs_fop_t op, gf_boolean_t *can_block) { - pl_local_t *local = NULL; + int ret = 0; + + if (!__rw_allowable(pl_inode, region, op)) { + if (pl_inode->mlock_enforced) { + *can_block = _gf_false; + } else if ((!fd) || (fd && (fd->flags & O_NONBLOCK))) { + gf_log("locks", GF_LOG_TRACE, + "returning EAGAIN" + " because fd is O_NONBLOCK"); + *can_block = _gf_false; + } else { + *can_block = _gf_true; + } + } else { + ret = 1; + } - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, unwind); + return ret; +} - local->op = TRUNCATE; - local->offset = offset; - loc_copy (&local->loc, loc); - if (xdata) - local->xdata = dict_ref (xdata); +static pl_fdctx_t * +pl_new_fdctx() +{ + pl_fdctx_t *fdctx = GF_MALLOC(sizeof(*fdctx), gf_locks_mt_pl_fdctx_t); + GF_VALIDATE_OR_GOTO("posix-locks", fdctx, out); - frame->local = local; + INIT_LIST_HEAD(&fdctx->locks_list); - STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->stat, loc, NULL); +out: + return fdctx; +} - return 0; +static pl_fdctx_t * +pl_check_n_create_fdctx(xlator_t *this, fd_t *fd) +{ + int ret = 0; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; + + GF_VALIDATE_OR_GOTO("posix-locks", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &tmp); + if ((ret != 0) || (tmp == 0)) { + fdctx = pl_new_fdctx(); + if (fdctx == NULL) { + goto unlock; + } + } -unwind: - gf_log (this->name, GF_LOG_ERROR, "truncate for %s failed with ret: %d, " - "error: %s", loc->path, -1, strerror (ENOMEM)); - STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fdctx); + if (ret != 0) { + GF_FREE(fdctx); + fdctx = NULL; + UNLOCK(&fd->lock); + gf_log(this->name, GF_LOG_DEBUG, "failed to 
set fd ctx"); + goto out; + } + } +unlock: + UNLOCK(&fd->lock); - return 0; +out: + return fdctx; } +int32_t +pl_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, DECREMENT); + + PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} int -pl_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) +pl_discard_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - pl_local_t *local = NULL; + pl_track_io_fop_count(frame->local, this, INCREMENT); + + STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +} + +int32_t +pl_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + frame->local = local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + len - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + 
pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_DISCARD, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, unwind); + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - local->op = FTRUNCATE; - local->offset = offset; - local->fd = fd_ref (fd); - if (xdata) - local->xdata = dict_ref (xdata); + rw->stub = fop_discard_stub(frame, pl_discard_cont, fd, offset, len, + xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - frame->local = local; + rw->region = region; - STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + if (allowed == 1) + STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); unwind: - gf_log (this->name, GF_LOG_ERROR, "ftruncate failed with ret: %d, " - "error: %s", -1, strerror (ENOMEM)); - STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + if (op_ret == -1) + PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - return 0; + return 0; } -int -pl_locks_by_fd (pl_inode_t *pl_inode, fd_t *fd) +int32_t +pl_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - posix_lock_t *l = NULL; - int found = 0; + pl_track_io_fop_count(frame->local, this, DECREMENT); - pthread_mutex_lock (&pl_inode->mutex); - { + PL_STACK_UNWIND(zerofill, xdata, frame, 
op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - list_for_each_entry (l, &pl_inode->ext_list, list) { - if ((l->fd_num == fd_to_fdnum(fd))) { - found = 1; - break; - } - } +int +pl_zerofill_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, INCREMENT); - } - pthread_mutex_unlock (&pl_inode->mutex); - return found; + STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; } -static void -delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) +int32_t +pl_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - posix_lock_t *tmp = NULL; - posix_lock_t *l = NULL; + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + frame->local = local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + len - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_ZEROFILL, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + 
pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - struct list_head blocked_list; + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - INIT_LIST_HEAD (&blocked_list); + rw->stub = fop_zerofill_stub(frame, pl_zerofill_cont, fd, offset, + len, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - pthread_mutex_lock (&pl_inode->mutex); - { + rw->region = region; - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if ((l->fd_num == fd_to_fdnum(fd))) { - if (l->blocked) { - list_move_tail (&l->list, &blocked_list); - continue; - } - __delete_lock (pl_inode, l); - __destroy_lock (l); - } - } + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } - } - pthread_mutex_unlock (&pl_inode->mutex); + if (allowed == 1) + STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(zerofill, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - list_for_each_entry_safe (l, tmp, &blocked_list, list) { - list_del_init(&l->list); - STACK_UNWIND_STRICT (lk, l->frame, -1, EAGAIN, &l->user_flock, - NULL); - __destroy_lock (l); - } + return 0; +} - grant_blocked_locks (this, pl_inode); +int +pl_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + pl_local_t *local = frame->local; - do_blocked_rw (pl_inode); + pl_track_io_fop_count(local, this, DECREMENT); + if (local->op == GF_FOP_TRUNCATE) + PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + else + PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 
0; } -static void -__delete_locks_of_owner (pl_inode_t *pl_inode, - client_t *client, gf_lkowner_t *owner) -{ - posix_lock_t *tmp = NULL; - posix_lock_t *l = NULL; - - /* TODO: what if it is a blocked lock with pending l->frame */ - - list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { - if (l->blocked) - continue; - if ((l->client == client) && - is_same_lkowner (&l->owner, owner)) { - gf_log ("posix-locks", GF_LOG_TRACE, - " Flushing lock" - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" state: %s", - l->fl_type == F_UNLCK ? "Unlock" : "Lock", - l->client_pid, - lkowner_utoa (&l->owner), - l->user_flock.l_start, - l->user_flock.l_len, - l->blocked == 1 ? "Blocked" : "Active"); - - __delete_lock (pl_inode, l); - __destroy_lock (l); - } - } +int +pl_ftruncate_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + pl_track_io_fop_count(frame->local, this, INCREMENT); - return; + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } - -int32_t -pl_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +int +pl_truncate_cont(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + pl_track_io_fop_count(frame->local, this, INCREMENT); + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } -int32_t -pl_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - int32_t op_errno = EINVAL; - int op_ret = -1; - int32_t bcount = 0; - int32_t gcount = 0; - char key[PATH_MAX] = {0, }; - char *lk_summary = NULL; - pl_inode_t *pl_inode = NULL; - dict_t *dict = NULL; - clrlk_args args = {0,}; - char *brickname = NULL; - - if (!name) - goto usual; - - if (strncmp (name, 
GF_XATTR_CLRLK_CMD, strlen (GF_XATTR_CLRLK_CMD))) - goto usual; - - if (clrlk_parse_args (name, &args)) { - op_errno = EINVAL; - goto out; - } +static int +truncate_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + pl_local_t *local = frame->local; + inode_t *inode = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + if (op_ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "got error (errno=%d, stderror=%s) from child", op_errno, + strerror(op_errno)); + goto unwind; + } + + if (local->op == GF_FOP_TRUNCATE) + inode = local->loc[0].inode; + else + inode = local->fd->inode; + + local->inode = inode_ref(inode); + + pl_inode = pl_inode_get(this, inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = local->offset; + region.fl_end = LLONG_MAX; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(local->fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, &region, local->fd, local->op, + &can_block); - dict = dict_new (); - if (!dict) { - op_errno = ENOMEM; - goto out; - } + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - pl_inode = pl_inode_get (this, loc->inode); - if (!pl_inode) { + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { op_errno = ENOMEM; - goto out; - } + op_ret = 
-1; + goto unlock; + } + + if (local->op == GF_FOP_TRUNCATE) + rw->stub = fop_truncate_stub(frame, pl_truncate_cont, + &local->loc[0], local->offset, + local->xdata); + else + rw->stub = fop_ftruncate_stub(frame, pl_ftruncate_cont, + local->fd, local->offset, + local->xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - switch (args.type) { - case CLRLK_INODE: - case CLRLK_ENTRY: - op_ret = clrlk_clear_lks_in_all_domains (this, pl_inode, - &args, &bcount, - &gcount, - &op_errno); - if (op_ret) - goto out; - break; - case CLRLK_POSIX: - op_ret = clrlk_clear_posixlk (this, pl_inode, &args, - &bcount, &gcount, - &op_errno); - if (op_ret) - goto out; - break; - case CLRLK_TYPE_MAX: - op_errno = EINVAL; - goto out; - } + rw->region = region; - op_ret = fetch_pathinfo (this, loc->inode, &op_errno, &brickname); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "Couldn't get brickname"); - } else { - op_ret = format_brickname(brickname); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "Couldn't format brickname"); - GF_FREE(brickname); - brickname = NULL; - } + list_add_tail(&rw->list, &pl_inode->rw_list); } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } - if (!gcount && !bcount) { - if (gf_asprintf (&lk_summary, "No locks cleared.") == -1) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - } else if (gf_asprintf (&lk_summary, "%s: %s blocked locks=%d " - "granted locks=%d", - (brickname == NULL)? this->name : brickname, - (args.type == CLRLK_INODE)? "inode": - (args.type == CLRLK_ENTRY)? "entry": - (args.type == CLRLK_POSIX)? 
"posix": " ", - bcount, gcount) == -1) { - op_ret = -1; - op_errno = ENOMEM; - goto out; + if (allowed == 1) { + switch (local->op) { + case GF_FOP_TRUNCATE: + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc[0], + local->offset, local->xdata); + break; + case GF_FOP_FTRUNCATE: + STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, local->fd, + local->offset, local->xdata); + break; + default: + break; } + } +unwind: + if (op_ret == -1) { + gf_log(this ? this->name : "locks", GF_LOG_ERROR, + "truncate failed with " + "ret: %d, error: %s", + op_ret, strerror(op_errno)); - strncpy (key, name, strlen (name)); - if (dict_set_dynstr (dict, key, lk_summary)) { - op_ret = -1; - op_errno = ENOMEM; - goto out; + switch (local->op) { + case GF_FOP_TRUNCATE: + PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, buf, + NULL, xdata); + break; + case GF_FOP_FTRUNCATE: + PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, buf, + NULL, xdata); + break; + default: + break; } + } + return 0; +} - op_ret = 0; -out: - GF_FREE(brickname); - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); +int +pl_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + pl_local_t *local = NULL; + int ret = -1; - GF_FREE (args.opts); - if (op_ret && lk_summary) - GF_FREE (lk_summary); - if (dict) - dict_unref (dict); - return 0; + GF_VALIDATE_OR_GOTO("locks", this, unwind); -usual: - STACK_WIND (frame, pl_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); - return 0; + local = mem_get0(this->local_pool); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + local->op = GF_FOP_TRUNCATE; + local->offset = offset; + loc_copy(&local->loc[0], loc); + if (xdata) + local->xdata = dict_ref(xdata); + + frame->local = local; + + STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->stat, loc, NULL); + ret = 0; + +unwind: + if (ret == -1) { + gf_log(this ? this->name : "locks", GF_LOG_ERROR, + "truncate on %s failed with" + " ret: %d, error: %s", + loc->path, -1, strerror(ENOMEM)); + STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + } + return 0; } -static int -format_brickname(char *brickname) +int +pl_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - int ret = -1; - char *hostname = NULL; - char *volume = NULL; - char *saveptr = NULL; + pl_local_t *local = NULL; + int ret = -1; - if (!brickname) - goto out; + GF_VALIDATE_OR_GOTO("locks", this, unwind); + local = mem_get0(this->local_pool); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); - strtok_r(brickname, ":", &saveptr); - hostname = gf_strdup(strtok_r(NULL, ":", &saveptr)); - if (hostname == NULL) - goto out; - volume = gf_strdup(strtok_r(NULL, ".", &saveptr)); - if (volume == NULL) - goto out; + local->op = GF_FOP_FTRUNCATE; + local->offset = offset; + local->fd = fd_ref(fd); + if (xdata) + local->xdata = dict_ref(xdata); - sprintf(brickname, "%s:%s", hostname, volume); + frame->local = local; - ret = 0; -out: - GF_FREE(hostname); - GF_FREE(volume); - return ret; + STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + ret = 0; +unwind: + if (ret == -1) { + gf_log(this ? 
this->name : "locks", GF_LOG_ERROR, + "ftruncate failed with" + " ret: %d, error: %s", + -1, strerror(ENOMEM)); + STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + } + return 0; } -static int -fetch_pathinfo (xlator_t *this, inode_t *inode, int32_t *op_errno, - char **brickname) +int +pl_locks_by_fd(pl_inode_t *pl_inode, fd_t *fd) { - int ret = -1; - loc_t loc = {0, }; - dict_t *dict = NULL; + posix_lock_t *l = NULL; + int found = 0; - if (!brickname) - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + found = 1; + break; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); + return found; +} - if (!op_errno) - goto out; +static void +delete_locks_of_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) +{ + posix_lock_t *tmp = NULL; + posix_lock_t *l = NULL; - uuid_copy (loc.gfid, inode->gfid); - loc.inode = inode_ref (inode); + struct list_head blocked_list; - ret = syncop_getxattr (FIRST_CHILD(this), &loc, &dict, - GF_XATTR_PATHINFO_KEY); - if (ret < 0) { - *op_errno = errno; - goto out; + INIT_LIST_HEAD(&blocked_list); + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + if (l->blocked) { + list_move_tail(&l->list, &blocked_list); + continue; + } + __delete_lock(l); + __destroy_lock(l); + } } + } + pthread_mutex_unlock(&pl_inode->mutex); - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, brickname); - if (ret) - goto out; + list_for_each_entry_safe(l, tmp, &blocked_list, list) + { + list_del_init(&l->list); + STACK_UNWIND_STRICT(lk, l->frame, -1, EAGAIN, &l->user_flock, NULL); + __destroy_lock(l); + } - *brickname = gf_strdup(*brickname); - if (*brickname == NULL) { - ret = -1; - goto out; - } + grant_blocked_locks(this, pl_inode); - ret = 0; -out: - if (dict != NULL) { - dict_unref (dict); + do_blocked_rw(pl_inode); +} + +static void 
+__delete_locks_of_owner(pl_inode_t *pl_inode, client_t *client, + gf_lkowner_t *owner) +{ + posix_lock_t *tmp = NULL; + posix_lock_t *l = NULL; + + /* TODO: what if it is a blocked lock with pending l->frame */ + + list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) + { + if (l->blocked) + continue; + if ((l->client == client) && is_same_lkowner(&l->owner, owner)) { + gf_log("posix-locks", GF_LOG_TRACE, + " Flushing lock" + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 + " state: %s", + l->fl_type == F_UNLCK ? "Unlock" : "Lock", l->client_pid, + lkowner_utoa(&l->owner), l->user_flock.l_start, + l->user_flock.l_len, l->blocked == 1 ? "Blocked" : "Active"); + + __delete_lock(l); + __destroy_lock(l); } - loc_wipe(&loc); + } - return ret; + return; } - -int -pl_lockinfo_get_brickname (xlator_t *this, inode_t *inode, int32_t *op_errno) +int32_t +pl_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - int ret = -1; - posix_locks_private_t *priv = NULL; - char *brickname = NULL; - char *end = NULL; - char *tmp = NULL; + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} - priv = this->private; +static int32_t +pl_getxattr_clrlk(xlator_t *this, const char *name, inode_t *inode, + dict_t **dict, int32_t *op_errno) +{ + int32_t bcount = 0; + int32_t gcount = 0; + char *key = NULL; + char *lk_summary = NULL; + pl_inode_t *pl_inode = NULL; + clrlk_args args = { + 0, + }; + char *brickname = NULL; + int32_t op_ret = -1; + + *op_errno = EINVAL; + + if (clrlk_parse_args(name, &args)) { + *op_errno = EINVAL; + goto out; + } + + *dict = dict_new(); + if (!*dict) { + *op_errno = ENOMEM; + goto out; + } + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + *op_errno = ENOMEM; + goto out; + } + + switch (args.type) { + case CLRLK_INODE: + case CLRLK_ENTRY: + op_ret = clrlk_clear_lks_in_all_domains(this, pl_inode, &args, + &bcount, &gcount, 
op_errno); + break; + case CLRLK_POSIX: + op_ret = clrlk_clear_posixlk(this, pl_inode, &args, &bcount, + &gcount, op_errno); + break; + default: + op_ret = -1; + *op_errno = EINVAL; + } + if (op_ret) { + if (args.type >= CLRLK_TYPE_MAX) { + gf_log(this->name, GF_LOG_ERROR, + "clear locks: invalid lock type %d", args.type); + } else { + gf_log(this->name, GF_LOG_ERROR, + "clear locks of type %s failed: %s", + clrlk_type_names[args.type], strerror(*op_errno)); + } - ret = fetch_pathinfo (this, inode, op_errno, &brickname); - if (ret) - goto out; + goto out; + } - end = strrchr (brickname, ':'); - if (!end) { - GF_FREE(brickname); - ret = -1; - goto out; + op_ret = fetch_pathinfo(this, inode, op_errno, &brickname); + if (op_ret) { + gf_log(this->name, GF_LOG_WARNING, "Couldn't get brickname"); + } else { + op_ret = format_brickname(brickname); + if (op_ret) { + gf_log(this->name, GF_LOG_WARNING, "Couldn't format brickname"); + GF_FREE(brickname); + brickname = NULL; } + } - tmp = brickname; - brickname = gf_strndup (brickname, (end - brickname)); - if (brickname == NULL) { - ret = -1; - goto out; + if (!gcount && !bcount) { + if (gf_asprintf(&lk_summary, "No locks cleared.") == -1) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; } + } else if (gf_asprintf(&lk_summary, + "%s: %s blocked locks=%d " + "granted locks=%d", + (brickname == NULL) ? 
this->name : brickname, + clrlk_type_names[args.type], bcount, gcount) == -1) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + gf_log(this->name, GF_LOG_DEBUG, "%s", lk_summary); + + key = gf_strdup(name); + if (!key) { + op_ret = -1; + goto out; + } + if (dict_set_dynstr(*dict, key, lk_summary)) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = 0; - priv->brickname = brickname; - ret = 0; out: - GF_FREE(tmp); - return ret; + GF_FREE(brickname); + GF_FREE(args.opts); + GF_FREE(key); + if (op_ret) { + GF_FREE(lk_summary); + } + + return op_ret; } -char * -pl_lockinfo_key (xlator_t *this, inode_t *inode, int32_t *op_errno) +int32_t +pl_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - posix_locks_private_t *priv = NULL; - char *key = NULL; - int ret = 0; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + dict_t *dict = NULL; - priv = this->private; + if (!name) + goto usual; - if (priv->brickname == NULL) { - ret = pl_lockinfo_get_brickname (this, inode, op_errno); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "cannot get brickname"); - goto out; - } - } + if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) + goto usual; - key = priv->brickname; -out: - return key; + op_ret = pl_getxattr_clrlk(this, name, loc->inode, &dict, &op_errno); + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + + if (dict) + dict_unref(dict); + return 0; + +usual: + STACK_WIND(frame, pl_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; } -int32_t -pl_fgetxattr_handle_lockinfo (xlator_t *this, fd_t *fd, - dict_t *dict, int32_t *op_errno) +static int +format_brickname(char *brickname) { - pl_inode_t *pl_inode = NULL; - char *key = NULL, *buf = NULL; - int32_t op_ret = 0; - unsigned long fdnum = 0, len = 0; - dict_t *tmp = NULL; + int ret = -1; + char *hostname = NULL; + char *volume = NULL; + char *saveptr = NULL; - 
pl_inode = pl_inode_get (this, fd->inode); + if (!brickname) + goto out; - if (!pl_inode) { - gf_log (this->name, GF_LOG_DEBUG, "Could not get inode."); - *op_errno = EBADFD; - op_ret = -1; - goto out; - } + strtok_r(brickname, ":", &saveptr); + hostname = gf_strdup(strtok_r(NULL, ":", &saveptr)); + if (hostname == NULL) + goto out; + volume = gf_strdup(strtok_r(NULL, ".", &saveptr)); + if (volume == NULL) + goto out; - if (!pl_locks_by_fd (pl_inode, fd)) { - op_ret = 0; - goto out; - } + sprintf(brickname, "%s:%s", hostname, volume); - fdnum = fd_to_fdnum (fd); + ret = 0; +out: + GF_FREE(hostname); + GF_FREE(volume); + return ret; +} - key = pl_lockinfo_key (this, fd->inode, op_errno); - if (key == NULL) { - op_ret = -1; - goto out; - } +static int +fetch_pathinfo(xlator_t *this, inode_t *inode, int32_t *op_errno, + char **brickname) +{ + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *dict = NULL; + + if (!brickname) + goto out; + + if (!op_errno) + goto out; + + gf_uuid_copy(loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + + ret = syncop_getxattr(FIRST_CHILD(this), &loc, &dict, GF_XATTR_PATHINFO_KEY, + NULL, NULL); + if (ret < 0) { + *op_errno = -ret; + ret = -1; + goto out; + } + + ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, brickname); + if (ret) + goto out; + + *brickname = gf_strdup(*brickname); + if (*brickname == NULL) { + ret = -1; + goto out; + } + + ret = 0; +out: + if (dict != NULL) { + dict_unref(dict); + } + loc_wipe(&loc); - tmp = dict_new (); - if (tmp == NULL) { - op_ret = -1; - *op_errno = ENOMEM; - goto out; - } + return ret; +} - op_ret = dict_set_uint64 (tmp, key, fdnum); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value " - "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", - fdnum, fd, uuid_utoa (fd->inode->gfid), - strerror (*op_errno)); - goto out; - } +int +pl_lockinfo_get_brickname(xlator_t *this, inode_t *inode, int32_t *op_errno) +{ + 
posix_locks_private_t *priv = this->private; + char *brickname = NULL; + char *end = NULL; + char *tmp = NULL; - len = dict_serialized_length (tmp); - if (len < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "dict_serialized_length failed (%s) while handling " - "lockinfo for fd (ptr:%p inode-gfid:%s)", - strerror (*op_errno), fd, uuid_utoa (fd->inode->gfid)); - goto out; - } + int ret = fetch_pathinfo(this, inode, op_errno, &brickname); + if (ret) + goto out; - buf = GF_CALLOC (1, len, gf_common_mt_char); - if (buf == NULL) { - op_ret = -1; - *op_errno = ENOMEM; - goto out; - } + end = strrchr(brickname, ':'); + if (!end) { + GF_FREE(brickname); + ret = -1; + goto out; + } + + tmp = brickname; + brickname = gf_strndup(brickname, (end - brickname)); + if (brickname == NULL) { + ret = -1; + goto out; + } + + priv->brickname = brickname; + ret = 0; +out: + GF_FREE(tmp); + return ret; +} - op_ret = dict_serialize (tmp, buf); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "dict_serialize failed (%s) while handling lockinfo " - "for fd (ptr: %p inode-gfid:%s)", strerror (*op_errno), - fd, uuid_utoa (fd->inode->gfid)); - goto out; - } +char * +pl_lockinfo_key(xlator_t *this, inode_t *inode, int32_t *op_errno) +{ + posix_locks_private_t *priv = this->private; + char *key = NULL; + int ret = 0; - op_ret = dict_set_dynptr (dict, GF_XATTR_LOCKINFO_KEY, buf, len); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value " - "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", - fdnum, fd, uuid_utoa (fd->inode->gfid), - strerror (*op_errno)); - goto out; + if (priv->brickname == NULL) { + ret = pl_lockinfo_get_brickname(this, inode, op_errno); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "cannot get brickname"); + goto out; } + } - buf = NULL; + key = priv->brickname; out: - if (tmp != NULL) { - dict_unref (tmp); - } + 
return key; +} - if (buf != NULL) { - GF_FREE (buf); - } +int32_t +pl_fgetxattr_handle_lockinfo(xlator_t *this, fd_t *fd, dict_t *dict, + int32_t *op_errno) +{ + char *key = NULL, *buf = NULL; + int32_t op_ret = 0; + unsigned long fdnum = 0; + int32_t len = 0; + dict_t *tmp = NULL; + + pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL); + + if (!pl_inode) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get inode."); + *op_errno = EBADFD; + op_ret = -1; + goto out; + } + + if (!pl_locks_by_fd(pl_inode, fd)) { + op_ret = 0; + goto out; + } + + fdnum = fd_to_fdnum(fd); + + key = pl_lockinfo_key(this, fd->inode, op_errno); + if (key == NULL) { + op_ret = -1; + goto out; + } + + tmp = dict_new(); + if (tmp == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = dict_set_uint64(tmp, key, fdnum); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "setting lockinfo value " + "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", + fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno)); + goto out; + } + + op_ret = dict_allocate_and_serialize(tmp, (char **)&buf, + (unsigned int *)&len); + if (op_ret != 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "dict_serialized_length failed (%s) while handling " + "lockinfo for fd (ptr:%p inode-gfid:%s)", + strerror(*op_errno), fd, uuid_utoa(fd->inode->gfid)); + goto out; + } + + op_ret = dict_set_dynptr(dict, GF_XATTR_LOCKINFO_KEY, buf, len); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, + "setting lockinfo value " + "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)", + fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno)); + goto out; + } + + buf = NULL; +out: + if (tmp != NULL) { + dict_unref(tmp); + } - return op_ret; -} + if (buf != NULL) { + GF_FREE(buf); + } + return op_ret; +} int32_t -pl_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, 
dict_t *xdata) +pl_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - int32_t op_ret = 0, op_errno = 0; - dict_t *dict = NULL; - - if (!name) { - goto usual; + int32_t op_ret = 0, op_errno = 0; + dict_t *dict = NULL; + + if (!name) { + goto usual; + } + + if (strcmp(name, GF_XATTR_LOCKINFO_KEY) == 0) { + dict = dict_new(); + if (dict == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - if (strcmp (name, GF_XATTR_LOCKINFO_KEY) == 0) { - dict = dict_new (); - if (dict == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + op_ret = pl_fgetxattr_handle_lockinfo(this, fd, dict, &op_errno); + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "getting lockinfo on fd (ptr:%p inode-gfid:%s) " + "failed (%s)", + fd, uuid_utoa(fd->inode->gfid), strerror(op_errno)); + } - op_ret = pl_fgetxattr_handle_lockinfo (this, fd, dict, - &op_errno); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "getting lockinfo on fd (ptr:%p inode-gfid:%s) " - "failed (%s)", fd, uuid_utoa (fd->inode->gfid), - strerror (op_errno)); - } + goto unwind; + } else if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == + 0) { + op_ret = pl_getxattr_clrlk(this, name, fd->inode, &dict, &op_errno); - goto unwind; - } else { - goto usual; - } + goto unwind; + } else { + goto usual; + } unwind: - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL); - if (dict != NULL) { - dict_unref (dict); - } + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, NULL); + if (dict != NULL) { + dict_unref(dict); + } - return 0; + return 0; usual: - STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); - return 0; + STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; } int32_t -pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, - int32_t 
*op_errno) +pl_migrate_locks(call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, + int32_t *op_errno) { - pl_inode_t *pl_inode = NULL; - uint64_t newfd_num = 0; - posix_lock_t *l = NULL; - int32_t op_ret = 0; - - newfd_num = fd_to_fdnum (newfd); - - pl_inode = pl_inode_get (frame->this, newfd->inode); - if (pl_inode == NULL) { - op_ret = -1; - *op_errno = EBADFD; - goto out; - } - - pthread_mutex_lock (&pl_inode->mutex); + posix_lock_t *l = NULL; + int32_t op_ret = 0; + uint64_t newfd_num = fd_to_fdnum(newfd); + + pl_inode_t *pl_inode = pl_inode_get(frame->this, newfd->inode, NULL); + if (pl_inode == NULL) { + op_ret = -1; + *op_errno = EBADFD; + goto out; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) { - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (l->fd_num == oldfd_num) { - l->fd_num = newfd_num; - l->client = frame->root->client; - } - } + if (l->fd_num == oldfd_num) { + l->fd_num = newfd_num; + l->client = frame->root->client; + } } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - op_ret = 0; + op_ret = 0; out: - return op_ret; + return op_ret; } int32_t -pl_fsetxattr_handle_lockinfo (call_frame_t *frame, fd_t *fd, char *lockinfo_buf, - int len, int32_t *op_errno) +pl_fsetxattr_handle_lockinfo(call_frame_t *frame, fd_t *fd, char *lockinfo_buf, + int len, int32_t *op_errno) { - int32_t op_ret = -1; - dict_t *lockinfo = NULL; - uint64_t oldfd_num = 0; - char *key = NULL; - - lockinfo = dict_new (); - if (lockinfo == NULL) { - op_ret = -1; - *op_errno = ENOMEM; - goto out; - } + int32_t op_ret = -1; + uint64_t oldfd_num = 0; + char *key = NULL; + + dict_t *lockinfo = dict_new(); + if (lockinfo == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); + if (op_ret < 0) { + *op_errno = -op_ret; + op_ret = -1; + goto out; + } + + key = pl_lockinfo_key(frame->this, fd->inode, op_errno); + 
if (key == NULL) { + op_ret = -1; + goto out; + } + + op_ret = dict_get_uint64(lockinfo, key, &oldfd_num); + + if (oldfd_num == 0) { + op_ret = 0; + goto out; + } + + op_ret = pl_migrate_locks(frame, fd, oldfd_num, op_errno); + if (op_ret < 0) { + gf_log(frame->this->name, GF_LOG_WARNING, + "migration of locks from oldfd (ptr:%p) to newfd " + "(ptr:%p) (inode-gfid:%s)", + (void *)(uintptr_t)oldfd_num, fd, uuid_utoa(fd->inode->gfid)); + goto out; + } - op_ret = dict_unserialize (lockinfo_buf, len, &lockinfo); - if (op_ret < 0) { - *op_errno = -op_ret; - op_ret = -1; - goto out; - } +out: + dict_unref(lockinfo); - key = pl_lockinfo_key (frame->this, fd->inode, op_errno); - if (key == NULL) { - op_ret = -1; - goto out; - } + return op_ret; +} - op_ret = dict_get_uint64 (lockinfo, key, &oldfd_num); +int32_t +pl_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; - if (oldfd_num == 0) { - op_ret = 0; - goto out; + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - op_ret = pl_migrate_locks (frame, fd, oldfd_num, op_errno); - if (op_ret < 0) { - gf_log (frame->this->name, GF_LOG_WARNING, - "migration of locks from oldfd (ptr:%p) to newfd " - "(ptr:%p) (inode-gfid:%s)", (void *)oldfd_num, fd, - uuid_utoa (fd->inode->gfid)); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } -out: - dict_unref (lockinfo); - - return op_ret; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; } int32_t -pl_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) 
+pl_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - int32_t op_ret = 0, op_errno = 0; - void *lockinfo_buf = NULL; - int len = 0; + int32_t op_errno = 0; + void *lockinfo_buf = NULL; + int len = 0; + char *name = NULL; + posix_locks_private_t *priv = this->private; + + int32_t op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + &lockinfo_buf, &len); + if (lockinfo_buf == NULL) { + goto usual; + } + + op_ret = pl_fsetxattr_handle_lockinfo(frame, fd, lockinfo_buf, len, + &op_errno); + if (op_ret < 0) { + goto unwind; + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - &lockinfo_buf, &len); - if (lockinfo_buf == NULL) { - goto usual; - } +usual: + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); - op_ret = pl_fsetxattr_handle_lockinfo (frame, fd, lockinfo_buf, len, - &op_errno); - if (op_ret < 0) { - goto unwind; - } + PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, ((loc_t *)NULL), fd, + priv); -usual: - STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); - return 0; + STACK_WIND(frame, pl_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL); - return 0; + PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno, NULL); + + return 0; } int32_t -pl_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, dict_t *xdata) +pl_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if (op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = 
pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (opendir, - frame, - op_ret, - op_errno, - fd, xdata); - return 0; + PL_STACK_UNWIND(opendir, xdata, frame, op_ret, op_errno, fd, xdata); + + return 0; } int32_t -pl_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) -{ - STACK_WIND (frame, - pl_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd, xdata); - return 0; - +pl_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; } int -pl_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +pl_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); + PL_STACK_UNWIND_FOR_CLIENT(flush, xdata, frame, op_ret, op_errno, xdata); - return 0; + return 0; } - int -pl_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata) +pl_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - pl_inode_t *pl_inode = NULL; - - pl_inode = pl_inode_get (this, fd->inode); - - if (!pl_inode) { - gf_log (this->name, GF_LOG_DEBUG, "Could not get inode."); - STACK_UNWIND_STRICT (flush, frame, -1, EBADFD, NULL); - return 0; - } - - pl_trace_flush (this, frame, fd); - - if (frame->root->lk_owner.len == 0) { - /* Handle special case when protocol/server sets lk-owner to zero. - * This usually happens due to a client disconnection. Hence, free - * all locks opened with this fd. 
- */ - gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks with fd %p", fd); - delete_locks_of_fd (this, pl_inode, fd); - goto wind; - - } - pthread_mutex_lock (&pl_inode->mutex); - { - __delete_locks_of_owner (pl_inode, frame->root->client, - &frame->root->lk_owner); + pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL); + if (!pl_inode) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get inode."); + STACK_UNWIND_STRICT(flush, frame, -1, EBADFD, NULL); + return 0; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_inode->migrated) { + pthread_mutex_unlock(&pl_inode->mutex); + STACK_UNWIND_STRICT(flush, frame, -1, EREMOTE, NULL); + return 0; } - pthread_mutex_unlock (&pl_inode->mutex); - - grant_blocked_locks (this, pl_inode); - - do_blocked_rw (pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); + + pl_trace_flush(this, frame, fd); + + if (frame->root->lk_owner.len == 0) { + /* Handle special case when protocol/server sets lk-owner to zero. + * This usually happens due to a client disconnection. Hence, free + * all locks opened with this fd. 
+ */ + gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd); + delete_locks_of_fd(this, pl_inode, fd); + goto wind; + } + pthread_mutex_lock(&pl_inode->mutex); + { + __delete_locks_of_owner(pl_inode, frame->root->client, + &frame->root->lk_owner); + } + pthread_mutex_unlock(&pl_inode->mutex); + + grant_blocked_locks(this, pl_inode); + + do_blocked_rw(pl_inode); wind: - STACK_WIND (frame, pl_flush_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - return 0; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; } - int -pl_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +pl_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if (op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } - int -pl_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +pl_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, pl_open_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, - loc, flags, fd, xdata); + int op_ret = -1; + int op_errno = EINVAL; + pl_inode_t *pl_inode = NULL; + posix_lock_t *l = NULL; + posix_locks_private_t *priv = this->private; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + op_ret = 0, op_errno = 0; 
+ pl_inode = pl_inode_get(this, fd->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, "Could not get inode"); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* As per design, under forced and file-based mandatory locking modes + * it doesn't matter whether inodes's lock list contain advisory or + * mandatory type locks. So we just check whether inode's lock list is + * empty or not to make sure that no locks are being held for the file. + * Whereas under optimal mandatory locking mode, we strictly fail open + * if and only if lock list contain mandatory locks. + */ + if (((priv->mandatory_mode == MLK_FILE_BASED) && pl_inode->mandatory) || + priv->mandatory_mode == MLK_FORCED) { + if (fd->flags & O_TRUNC) { + pthread_mutex_lock(&pl_inode->mutex); + { + if (!list_empty(&pl_inode->ext_list)) { + op_ret = -1; + op_errno = EAGAIN; + } + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } else if (priv->mandatory_mode == MLK_OPTIMAL) { + if (fd->flags & O_TRUNC) { + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if ((l->lk_flags & GF_LK_MANDATORY)) { + op_ret = -1; + op_errno = EAGAIN; + break; + } + } + } + pthread_mutex_unlock(&pl_inode->mutex); + } + } - return 0; +unwind: + if (op_ret == -1) + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL); + else + STACK_WIND(frame, pl_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; } - int -pl_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +pl_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - pl_fdctx_t *fdctx = NULL; + pl_fdctx_t *fdctx = NULL; - if 
(op_ret < 0) - goto unwind; + if (op_ret < 0) + goto unwind; - fdctx = pl_check_n_create_fdctx (this, fd); - if (!fdctx) { - op_errno = ENOMEM; - op_ret = -1; - goto unwind; - } + fdctx = pl_check_n_create_fdctx(this, fd); + if (!fdctx) { + op_errno = ENOMEM; + op_ret = -1; + goto unwind; + } unwind: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); + PL_STACK_UNWIND(create, xdata, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int -pl_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, - dict_t *xdata) +pl_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, pl_create_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, - loc, flags, mode, umask, fd, xdata); - return 0; -} + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} int -pl_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *stbuf, - struct iobref *iobref, dict_t *xdata) +pl_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - vector, count, stbuf, iobref, xdata); + pl_track_io_fop_count(frame->local, this, DECREMENT); - return 0; + PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; } int -pl_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t 
*xdata) +pl_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + pl_track_io_fop_count(frame->local, this, DECREMENT); - return 0; -} + PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} void -do_blocked_rw (pl_inode_t *pl_inode) +do_blocked_rw(pl_inode_t *pl_inode) { - struct list_head wind_list; - pl_rw_req_t *rw = NULL; - pl_rw_req_t *tmp = NULL; + struct list_head wind_list; + pl_rw_req_t *rw = NULL; + pl_rw_req_t *tmp = NULL; - INIT_LIST_HEAD (&wind_list); + INIT_LIST_HEAD(&wind_list); - pthread_mutex_lock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + list_for_each_entry_safe(rw, tmp, &pl_inode->rw_list, list) { - list_for_each_entry_safe (rw, tmp, &pl_inode->rw_list, list) { - if (__rw_allowable (pl_inode, &rw->region, - rw->stub->fop)) { - list_del_init (&rw->list); - list_add_tail (&rw->list, &wind_list); - } + if (__rw_allowable(pl_inode, &rw->region, rw->stub->fop)) { + list_del_init(&rw->list); + list_add_tail(&rw->list, &wind_list); + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; } + } } - pthread_mutex_unlock (&pl_inode->mutex); + } + pthread_mutex_unlock(&pl_inode->mutex); - list_for_each_entry_safe (rw, tmp, &wind_list, list) { - list_del_init (&rw->list); - call_resume (rw->stub); - GF_FREE (rw); - } + list_for_each_entry_safe(rw, tmp, &wind_list, list) + { + list_del_init(&rw->list); + call_resume(rw->stub); + GF_FREE(rw); + } - return; + return; } +/* when mandatory lock is enforced: + If an IO request comes on a region which is out of the boundary of the + granted mandatory lock, it will be rejected. + + Note: There is no IO blocking with mandatory lock enforced as it may be + a stale data from an old client. 
+ */ +gf_boolean_t static within_range(posix_lock_t *existing, posix_lock_t *new) +{ + if (existing->fl_start <= new->fl_start && existing->fl_end >= new->fl_end) + return _gf_true; + + return _gf_false; +} static int -__rw_allowable (pl_inode_t *pl_inode, posix_lock_t *region, - glusterfs_fop_t op) +__rw_allowable(pl_inode_t *pl_inode, posix_lock_t *region, glusterfs_fop_t op) { - posix_lock_t *l = NULL; - int ret = 1; + posix_lock_t *l = NULL; + posix_locks_private_t *priv = THIS->private; + int ret = 1; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if (locks_overlap (l, region) && !same_owner (l, region)) { - if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK)) - continue; - ret = 0; - break; + if (pl_inode->mlock_enforced) { + list_for_each_entry(l, &pl_inode->ext_list, list) + { + /* + with lock enforced (fencing) there should not be any blocking + lock coexisting. + */ + if (same_owner(l, region)) { + /* Should range check be strict for same owner with fencing? */ + if (locks_overlap(l, region)) { + if (within_range(l, region)) { + return 1; + } else { + /* + Should we allow read fop if it does not fit it in the + range? + if (op == GF_FOP_READ && l->fl_type != F_WRLCK) { + return 1; + } + */ + return 0; + } + } + } else { + if (locks_overlap(l, region)) { + /* + with fencing should a read from a different owner be + allowed if the mandatory lock taken is F_RDLCK? 
+ if (op == GF_FOP_READ && l->fl_type != F_WRLCK) { + return 1; + } + */ + return 0; } + } } - return ret; -} + /* No lock has been taken by this owner */ + return 0; + } + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (!l->blocked && locks_overlap(l, region) && !same_owner(l, region)) { + if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK)) + continue; + /* Check for mandatory lock under optimal + * mandatory-locking mode */ + if (priv->mandatory_mode == MLK_OPTIMAL && + !(l->lk_flags & GF_LK_MANDATORY)) + continue; + ret = 0; + break; + } + } + return ret; +} int -pl_readv_cont (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +pl_readv_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - STACK_WIND (frame, pl_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - fd, size, offset, flags, xdata); + pl_track_io_fop_count(frame->local, this, INCREMENT); - return 0; -} + STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} int -pl_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) -{ - posix_locks_private_t *priv = NULL; - pl_inode_t *pl_inode = NULL; - pl_rw_req_t *rw = NULL; - posix_lock_t region = {.list = {0, }, }; - int op_ret = 0; - int op_errno = 0; - char wind_needed = 1; - - - priv = this->private; - pl_inode = pl_inode_get (this, fd->inode); - - if (priv->mandatory && pl_inode->mandatory) { - region.fl_start = offset; - region.fl_end = offset + size - 1; - region.client = frame->root->client; - region.fd_num = fd_to_fdnum(fd); - region.client_pid = frame->root->pid; - region.owner = frame->root->lk_owner; - - pthread_mutex_lock (&pl_inode->mutex); - { - wind_needed = __rw_allowable (pl_inode, ®ion, - GF_FOP_READ); - if (wind_needed) { - goto unlock; - } - - if (fd->flags 
& O_NONBLOCK) { - gf_log (this->name, GF_LOG_TRACE, - "returning EAGAIN as fd is O_NONBLOCK"); - op_errno = EAGAIN; - op_ret = -1; - goto unlock; - } - - rw = GF_CALLOC (1, sizeof (*rw), - gf_locks_mt_pl_rw_req_t); - if (!rw) { - op_errno = ENOMEM; - op_ret = -1; - goto unlock; - } - - rw->stub = fop_readv_stub (frame, pl_readv_cont, - fd, size, offset, flags, - xdata); - if (!rw->stub) { - op_errno = ENOMEM; - op_ret = -1; - GF_FREE (rw); - goto unlock; - } +pl_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + if (!frame->local) { + frame->local = mem_get0(this->local_pool); + local = frame->local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + } + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + size - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_READ, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + op_errno = EAGAIN; + op_ret = -1; + goto unlock; + } - rw->region = region; + rw = 
GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } - list_add_tail (&rw->list, &pl_inode->rw_list); - } - unlock: - pthread_mutex_unlock (&pl_inode->mutex); - } + rw->stub = fop_readv_stub(frame, pl_readv_cont, fd, size, offset, + flags, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } + rw->region = region; - if (wind_needed) { - STACK_WIND (frame, pl_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - fd, size, offset, flags, xdata); + list_add_tail(&rw->list, &pl_inode->rw_list); } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + + if (allowed == 1) { + STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + } +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, NULL, 0, NULL, + NULL, NULL); - if (op_ret == -1) - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, - NULL, 0, NULL, NULL, NULL); - - return 0; + return 0; } - int -pl_writev_cont (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +pl_writev_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { - STACK_WIND (frame, pl_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); + pl_track_io_fop_count(frame->local, this, INCREMENT); - return 0; -} + STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} int -pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) -{ - posix_locks_private_t *priv = NULL; 
- pl_inode_t *pl_inode = NULL; - pl_rw_req_t *rw = NULL; - posix_lock_t region = {.list = {0, }, }; - int op_ret = 0; - int op_errno = 0; - char wind_needed = 1; - - priv = this->private; - pl_inode = pl_inode_get (this, fd->inode); - - if (priv->mandatory && pl_inode->mandatory) { - region.fl_start = offset; - region.fl_end = offset + iov_length (vector, count) - 1; - region.client = frame->root->client; - region.fd_num = fd_to_fdnum(fd); - region.client_pid = frame->root->pid; - region.owner = frame->root->lk_owner; - - pthread_mutex_lock (&pl_inode->mutex); - { - wind_needed = __rw_allowable (pl_inode, ®ion, - GF_FOP_WRITE); - if (wind_needed) - goto unlock; - - if (fd->flags & O_NONBLOCK) { - gf_log (this->name, GF_LOG_TRACE, - "returning EAGAIN because fd is " - "O_NONBLOCK"); - op_errno = EAGAIN; - op_ret = -1; - goto unlock; - } - - rw = GF_CALLOC (1, sizeof (*rw), - gf_locks_mt_pl_rw_req_t); - if (!rw) { - op_errno = ENOMEM; - op_ret = -1; - goto unlock; - } - - rw->stub = fop_writev_stub (frame, pl_writev_cont, - fd, vector, count, offset, - flags, iobref, xdata); - if (!rw->stub) { - op_errno = ENOMEM; - op_ret = -1; - GF_FREE (rw); - goto unlock; - } +pl_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + pl_rw_req_t *rw = NULL; + posix_lock_t region = { + .list = + { + 0, + }, + }; + gf_boolean_t enabled = _gf_false; + gf_boolean_t can_block = _gf_true; + int op_ret = 0; + int op_errno = 0; + int allowed = 1; + + GF_VALIDATE_OR_GOTO("locks", this, unwind); + + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + if (!frame->local) { + frame->local = mem_get0(this->local_pool); + local = frame->local; + local->inode = inode_ref(fd->inode); + local->fd = fd_ref(fd); + } + + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = 
ENOMEM; + goto unwind; + } + + if (frame->root->pid < 0) + enabled = _gf_false; + else + enabled = pl_is_mandatory_locking_enabled(pl_inode); + + if (enabled) { + region.fl_start = offset; + region.fl_end = offset + iov_length(vector, count) - 1; + region.client = frame->root->client; + region.fd_num = fd_to_fdnum(fd); + region.client_pid = frame->root->pid; + region.owner = frame->root->lk_owner; + + pthread_mutex_lock(&pl_inode->mutex); + { + allowed = pl_is_fop_allowed(pl_inode, ®ion, fd, GF_FOP_WRITE, + &can_block); + if (allowed == 1) { + if (pl_inode->mlock_enforced && + pl_inode->track_fop_wind_count) { + pl_inode->fop_wind_count++; + } + goto unlock; + } else if (!can_block) { + if (pl_inode->mlock_enforced) { + op_errno = EBUSY; + } else { + op_errno = EAGAIN; + } - rw->region = region; + op_ret = -1; + goto unlock; + } - list_add_tail (&rw->list, &pl_inode->rw_list); - } - unlock: - pthread_mutex_unlock (&pl_inode->mutex); - } + rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t); + if (!rw) { + op_errno = ENOMEM; + op_ret = -1; + goto unlock; + } + rw->stub = fop_writev_stub(frame, pl_writev_cont, fd, vector, count, + offset, flags, iobref, xdata); + if (!rw->stub) { + op_errno = ENOMEM; + op_ret = -1; + GF_FREE(rw); + goto unlock; + } - if (wind_needed) - STACK_WIND (frame, pl_writev_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, - fd, vector, count, offset, flags, iobref, xdata); + rw->region = region; - if (op_ret == -1) - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, - NULL); + list_add_tail(&rw->list, &pl_inode->rw_list); + } + unlock: + pthread_mutex_unlock(&pl_inode->mutex); + } + + if (allowed == 1) { + STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + } +unwind: + if (op_ret == -1) + PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, NULL, NULL, + NULL); - return 0; + return 0; } static int -__fd_has_locks (pl_inode_t 
*pl_inode, fd_t *fd) +__fd_has_locks(pl_inode_t *pl_inode, fd_t *fd) { - int found = 0; - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->ext_list, list) { - if ((l->fd_num == fd_to_fdnum(fd))) { - found = 1; - break; - } + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + return 1; } + } - return found; + return 0; } static posix_lock_t * -lock_dup (posix_lock_t *lock) +lock_dup(posix_lock_t *lock) { - posix_lock_t *new_lock = NULL; - - new_lock = new_posix_lock (&lock->user_flock, lock->client, - lock->client_pid, &lock->owner, - (fd_t *)lock->fd_num); - return new_lock; + int32_t op_errno = 0; + return new_posix_lock(&lock->user_flock, lock->client, lock->client_pid, + &lock->owner, (fd_t *)lock->fd_num, lock->lk_flags, + lock->blocking, &op_errno); } static int -__dup_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd, - pl_fdctx_t *fdctx) -{ - posix_lock_t *l = NULL; - posix_lock_t *duplock = NULL; - int ret = 0; - - list_for_each_entry (l, &pl_inode->ext_list, list) { - if ((l->fd_num == fd_to_fdnum(fd))) { - duplock = lock_dup (l); - if (!duplock) { - ret = -1; - break; - } +__dup_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx) +{ + posix_lock_t *l = NULL; + posix_lock_t *duplock = NULL; + int ret = 0; + + list_for_each_entry(l, &pl_inode->ext_list, list) + { + if (l->fd_num == fd_to_fdnum(fd)) { + duplock = lock_dup(l); + if (!duplock) { + ret = -1; + break; + } - list_add_tail (&duplock->list, &fdctx->locks_list); - } + list_add_tail(&duplock->list, &fdctx->locks_list); } + } - return ret; + return ret; } static int -__copy_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd, - pl_fdctx_t *fdctx) +__copy_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx) { - int ret = 0; - - ret = __dup_locks_to_fdctx (pl_inode, fd, fdctx); - if (ret) - goto out; - -out: - return ret; - + return __dup_locks_to_fdctx(pl_inode, fd, fdctx); } static void 
-pl_mark_eol_lock (posix_lock_t *lock) +pl_mark_eol_lock(posix_lock_t *lock) { - lock->user_flock.l_type = GF_LK_EOL; - return; + lock->user_flock.l_type = GF_LK_EOL; + return; } static posix_lock_t * -__get_next_fdctx_lock (pl_fdctx_t *fdctx) +__get_next_fdctx_lock(pl_fdctx_t *fdctx) { - posix_lock_t *lock = NULL; + posix_lock_t *lock = NULL; - GF_ASSERT (fdctx); + GF_ASSERT(fdctx); - if (list_empty (&fdctx->locks_list)) { - gf_log (THIS->name, GF_LOG_DEBUG, - "fdctx lock list empty"); - goto out; - } + if (list_empty(&fdctx->locks_list)) { + gf_log(THIS->name, GF_LOG_DEBUG, "fdctx lock list empty"); + goto out; + } - lock = list_entry (fdctx->locks_list.next, typeof (*lock), - list); + lock = list_entry(fdctx->locks_list.next, typeof(*lock), list); - GF_ASSERT (lock); + GF_ASSERT(lock); - list_del_init (&lock->list); + list_del_init(&lock->list); out: - return lock; + return lock; } static int -__set_next_lock_fd (pl_fdctx_t *fdctx, posix_lock_t *reqlock) +__set_next_lock_fd(pl_fdctx_t *fdctx, posix_lock_t *reqlock) { - posix_lock_t *lock = NULL; - int ret = 0; + posix_lock_t *lock = NULL; + int ret = 0; - GF_ASSERT (fdctx); + GF_ASSERT(fdctx); - lock = __get_next_fdctx_lock (fdctx); - if (!lock) { - gf_log (THIS->name, GF_LOG_DEBUG, - "marking EOL in reqlock"); - pl_mark_eol_lock (reqlock); - goto out; - } + lock = __get_next_fdctx_lock(fdctx); + if (!lock) { + gf_log(THIS->name, GF_LOG_DEBUG, "marking EOL in reqlock"); + pl_mark_eol_lock(reqlock); + goto out; + } - reqlock->user_flock = lock->user_flock; - reqlock->fl_start = lock->fl_start; - reqlock->fl_type = lock->fl_type; - reqlock->fl_end = lock->fl_end; - reqlock->owner = lock->owner; + reqlock->user_flock = lock->user_flock; + reqlock->fl_start = lock->fl_start; + reqlock->fl_type = lock->fl_type; + reqlock->fl_end = lock->fl_end; + reqlock->owner = lock->owner; out: - if (lock) - __destroy_lock (lock); + if (lock) + __destroy_lock(lock); - return ret; + return ret; } static int -pl_getlk_fd (xlator_t 
*this, pl_inode_t *pl_inode, - fd_t *fd, posix_lock_t *reqlock) +pl_getlk_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd, + posix_lock_t *reqlock) { - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; - int ret = 0; - - pthread_mutex_lock (&pl_inode->mutex); - { - if (!__fd_has_locks (pl_inode, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p has no active locks", fd); - ret = 0; - goto unlock; - } + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (!__fd_has_locks(pl_inode, fd)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, "fd=%p has no active locks", fd); + ret = 0; + goto out; + } - gf_log (this->name, GF_LOG_DEBUG, - "There are active locks on fd"); + gf_log(this->name, GF_LOG_DEBUG, "There are active locks on fd"); - ret = fd_ctx_get (fd, this, &tmp); - fdctx = (pl_fdctx_t *)(long) tmp; + ret = fd_ctx_get(fd, this, &tmp); + fdctx = (pl_fdctx_t *)(long)tmp; - if (list_empty (&fdctx->locks_list)) { - gf_log (this->name, GF_LOG_TRACE, - "no fdctx -> copying all locks on fd"); + if (list_empty(&fdctx->locks_list)) { + gf_log(this->name, GF_LOG_TRACE, + "no fdctx -> copying all locks on fd"); - ret = __copy_locks_to_fdctx (pl_inode, fd, fdctx); - if (ret) { - goto unlock; - } + ret = __copy_locks_to_fdctx(pl_inode, fd, fdctx); + if (ret) { + goto unlock; + } - ret = __set_next_lock_fd (fdctx, reqlock); + ret = __set_next_lock_fd(fdctx, reqlock); - } else { - gf_log (this->name, GF_LOG_TRACE, - "fdctx present -> returning the next lock"); - ret = __set_next_lock_fd (fdctx, reqlock); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get next lock of fd"); - goto unlock; - } - } + } else { + gf_log(this->name, GF_LOG_TRACE, + "fdctx present -> returning the next lock"); + ret = __set_next_lock_fd(fdctx, reqlock); + if (ret) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, + "could not get next lock of fd"); + goto out; + } } + } 
unlock: - pthread_mutex_unlock (&pl_inode->mutex); - return ret; - + pthread_mutex_unlock(&pl_inode->mutex); +out: + return ret; } int -pl_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) -{ - pl_inode_t *pl_inode = NULL; - int op_ret = 0; - int op_errno = 0; - int can_block = 0; - posix_lock_t *reqlock = NULL; - posix_lock_t *conf = NULL; - int ret = 0; - - if ((flock->l_start < 0) || (flock->l_len < 0)) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } +pl_metalock_is_active(pl_inode_t *pl_inode) +{ + if (list_empty(&pl_inode->metalk_list)) + return 0; + else + return 1; +} - pl_inode = pl_inode_get (this, fd->inode); - if (!pl_inode) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } +void +__pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock) +{ + list_add_tail(&reqlock->list, &pl_inode->queued_locks); +} + +int +pl_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + pl_inode_t *pl_inode = NULL; + int op_ret = 0; + int op_errno = 0; + int can_block = 0; + posix_lock_t *reqlock = NULL; + posix_lock_t *conf = NULL; + uint32_t lk_flags = 0; + posix_locks_private_t *priv = this->private; + pl_local_t *local = NULL; + short lock_type = 0; + + int ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_flags); + if (ret == 0) { + if (priv->mandatory_mode == MLK_NONE) + gf_log(this->name, GF_LOG_DEBUG, + "Lock flags received " + "in a non-mandatory locking environment, " + "continuing"); + else + gf_log(this->name, GF_LOG_DEBUG, + "Lock flags received, " + "continuing"); + } + + if ((flock->l_start < 0) || ((flock->l_start + flock->l_len) < 0)) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + /* As per 'man 3 fcntl', the value of l_len may be + * negative. In such cases, lock request should be + * considered for the range starting at 'l_start+l_len' + * and ending at 'l_start-1'. Update the fields accordingly. 
+ */ + if (flock->l_len < 0) { + flock->l_start += flock->l_len; + flock->l_len = labs(flock->l_len); + } + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } else { + frame->local = local; + local->fd = fd_ref(fd); + } - reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid, - &frame->root->lk_owner, fd); + pl_inode = pl_inode_get(this, fd->inode, local); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } - if (!reqlock) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + reqlock = new_posix_lock(flock, frame->root->client, frame->root->pid, + &frame->root->lk_owner, fd, lk_flags, can_block, + &op_errno); - pl_trace_in (this, frame, fd, NULL, cmd, flock, NULL); + if (!reqlock) { + op_ret = -1; + goto unwind; + } - switch (cmd) { + pl_trace_in(this, frame, fd, NULL, cmd, flock, NULL); + switch (cmd) { case F_RESLK_LCKW: - can_block = 1; + can_block = 1; - /* fall through */ + /* fall through */ case F_RESLK_LCK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - reqlock->frame = frame; - reqlock->this = this; + reqlock->frame = frame; + reqlock->this = this; - ret = pl_reserve_setlk (this, pl_inode, reqlock, - can_block); - if (ret < 0) { - if (can_block) - goto out; + ret = pl_reserve_setlk(this, pl_inode, reqlock, can_block); + if (ret < 0) { + if (can_block) + goto out; - op_ret = -1; - op_errno = -ret; - __destroy_lock (reqlock); - goto unwind; - } - /* Finally a getlk and return the call */ - conf = pl_getlk (pl_inode, reqlock); - if (conf) - posix_lock_to_flock (conf, flock); - break; + op_ret = -1; + op_errno = -ret; + __destroy_lock(reqlock); + goto unwind; + } + /* Finally a getlk and return the call */ + conf = pl_getlk(pl_inode, reqlock); + if (conf) + posix_lock_to_flock(conf, flock); + break; case F_RESLK_UNLCK: - reqlock->frame = frame; - reqlock->this = this; - ret = pl_reserve_unlock (this, pl_inode, reqlock); - if (ret < 0) { - 
op_ret = -1; - op_errno = -ret; - } - __destroy_lock (reqlock); - goto unwind; + reqlock->frame = frame; + reqlock->this = this; + ret = pl_reserve_unlock(this, pl_inode, reqlock); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + } + __destroy_lock(reqlock); + goto unwind; - break; + break; case F_GETLK_FD: - reqlock->frame = frame; - reqlock->this = this; - ret = pl_verify_reservelk (this, pl_inode, reqlock, can_block); - GF_ASSERT (ret >= 0); - - ret = pl_getlk_fd (this, pl_inode, fd, reqlock); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "getting locks on fd failed"); - op_ret = -1; - op_errno = ENOLCK; - goto unwind; - } + reqlock->frame = frame; + reqlock->this = this; + ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block); + GF_ASSERT(ret >= 0); + + ret = pl_getlk_fd(this, pl_inode, fd, reqlock); + if (ret < 0) { + gf_log(this->name, GF_LOG_DEBUG, "getting locks on fd failed"); + op_ret = -1; + op_errno = ENOLCK; + goto unwind; + } - gf_log (this->name, GF_LOG_TRACE, - "Replying with a lock on fd for healing"); + gf_log(this->name, GF_LOG_TRACE, + "Replying with a lock on fd for healing"); - posix_lock_to_flock (reqlock, flock); - __destroy_lock (reqlock); + posix_lock_to_flock(reqlock, flock); + __destroy_lock(reqlock); - break; + break; #if F_GETLK != F_GETLK64 case F_GETLK64: #endif case F_GETLK: - conf = pl_getlk (pl_inode, reqlock); - posix_lock_to_flock (conf, flock); - __destroy_lock (reqlock); + conf = pl_getlk(pl_inode, reqlock); + posix_lock_to_flock(conf, flock); + __destroy_lock(reqlock); - break; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: #endif case F_SETLKW: - can_block = 1; - reqlock->frame = frame; - reqlock->this = this; - - /* fall through */ + can_block = 1; + reqlock->frame = frame; + reqlock->this = this; + reqlock->blocking = can_block; + /* fall through */ #if F_SETLK != F_SETLK64 case F_SETLK64: #endif case F_SETLK: - memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock)); - ret = 
pl_verify_reservelk (this, pl_inode, reqlock, can_block); - if (ret < 0) { - gf_log (this->name, GF_LOG_TRACE, - "Lock blocked due to conflicting reserve lock"); - goto out; + reqlock->frame = frame; + reqlock->this = this; + lock_type = flock->l_type; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_inode->migrated) { + op_errno = EREMOTE; + pthread_mutex_unlock(&pl_inode->mutex); + STACK_UNWIND_STRICT(lk, frame, -1, op_errno, flock, xdata); + + __destroy_lock(reqlock); + goto out; } - ret = pl_setlk (this, pl_inode, reqlock, - can_block); + } + pthread_mutex_unlock(&pl_inode->mutex); + + ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block); + if (ret < 0) { + gf_log(this->name, GF_LOG_TRACE, + "Lock blocked due to conflicting reserve lock"); + goto out; + } + if (reqlock->fl_type != F_UNLCK && pl_inode->mlock_enforced) { + ret = pl_lock_preempt(pl_inode, reqlock); if (ret == -1) { - if ((can_block) && (F_UNLCK != flock->l_type)) { - pl_trace_block (this, frame, fd, NULL, cmd, flock, NULL); - goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN"); - op_ret = -1; - op_errno = EAGAIN; - __destroy_lock (reqlock); - - } else if ((0 == ret) && (F_UNLCK == flock->l_type)) { - /* For NLM's last "unlock on fd" detection */ - if (pl_locks_by_fd (pl_inode, fd)) - flock->l_type = F_RDLCK; - else - flock->l_type = F_UNLCK; + gf_log(this->name, GF_LOG_ERROR, "lock preempt failed"); + op_ret = -1; + op_errno = EAGAIN; + __destroy_lock(reqlock); + goto out; } - } -unwind: - pl_trace_out (this, frame, fd, NULL, cmd, flock, op_ret, op_errno, NULL); - pl_update_refkeeper (this, fd->inode); + pl_trace_block(this, frame, fd, NULL, cmd, flock, NULL); + goto unwind; + } + ret = pl_setlk(this, pl_inode, reqlock, can_block); + if (ret == -1) { + if ((can_block) && (F_UNLCK != lock_type)) { + goto out; + } + gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN"); + op_ret = -1; + op_errno = EAGAIN; + __destroy_lock(reqlock); + } else if (ret == -2) { + goto 
out; + } else if ((0 == ret) && (F_UNLCK == flock->l_type)) { + /* For NLM's last "unlock on fd" detection */ + if (pl_locks_by_fd(pl_inode, fd)) + flock->l_type = F_RDLCK; + else + flock->l_type = F_UNLCK; + } + } - STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata); +unwind: + pl_trace_out(this, frame, fd, NULL, cmd, flock, op_ret, op_errno, NULL); + pl_update_refkeeper(this, fd->inode); + + PL_STACK_UNWIND(lk, xdata, frame, op_ret, op_errno, flock, xdata); out: - return 0; + return 0; } - /* TODO: this function just logs, no action required?? */ int -pl_forget (xlator_t *this, - inode_t *inode) +pl_forget(xlator_t *this, inode_t *inode) { - pl_inode_t *pl_inode = NULL; + pl_inode_t *pl_inode = NULL; - posix_lock_t *ext_tmp = NULL; - posix_lock_t *ext_l = NULL; - struct list_head posixlks_released; + posix_lock_t *ext_tmp = NULL; + posix_lock_t *ext_l = NULL; + struct list_head posixlks_released; - pl_inode_lock_t *ino_tmp = NULL; - pl_inode_lock_t *ino_l = NULL; - struct list_head inodelks_released; + pl_inode_lock_t *ino_tmp = NULL; + pl_inode_lock_t *ino_l = NULL; + struct list_head inodelks_released; - pl_rw_req_t *rw_tmp = NULL; - pl_rw_req_t *rw_req = NULL; + pl_rw_req_t *rw_tmp = NULL; + pl_rw_req_t *rw_req = NULL; - pl_entry_lock_t *entry_tmp = NULL; - pl_entry_lock_t *entry_l = NULL; - struct list_head entrylks_released; + pl_entry_lock_t *entry_tmp = NULL; + pl_entry_lock_t *entry_l = NULL; + struct list_head entrylks_released; - pl_dom_list_t *dom = NULL; - pl_dom_list_t *dom_tmp = NULL; + pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom_tmp = NULL; - INIT_LIST_HEAD (&posixlks_released); - INIT_LIST_HEAD (&inodelks_released); - INIT_LIST_HEAD (&entrylks_released); + INIT_LIST_HEAD(&posixlks_released); + INIT_LIST_HEAD(&inodelks_released); + INIT_LIST_HEAD(&entrylks_released); - pl_inode = pl_inode_get (this, inode); + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) + return 0; - pthread_mutex_lock (&pl_inode->mutex); - { + 
pthread_mutex_lock(&pl_inode->mutex); + { + if (!list_empty(&pl_inode->rw_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending R/W requests found, releasing."); + + list_for_each_entry_safe(rw_req, rw_tmp, &pl_inode->rw_list, list) + { + list_del(&rw_req->list); + call_stub_destroy(rw_req->stub); + GF_FREE(rw_req); + } + } - if (!list_empty (&pl_inode->rw_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending R/W requests found, releasing."); + if (!list_empty(&pl_inode->ext_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending fcntl locks found, releasing."); + + list_for_each_entry_safe(ext_l, ext_tmp, &pl_inode->ext_list, list) + { + __delete_lock(ext_l); + if (ext_l->blocked) { + list_add_tail(&ext_l->list, &posixlks_released); + continue; + } + __destroy_lock(ext_l); + } + } - list_for_each_entry_safe (rw_req, rw_tmp, &pl_inode->rw_list, - list) { + list_for_each_entry_safe(dom, dom_tmp, &pl_inode->dom_list, inode_list) + { + if (!list_empty(&dom->inodelk_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending inode locks found, releasing."); - list_del (&rw_req->list); - GF_FREE (rw_req); - } + list_for_each_entry_safe(ino_l, ino_tmp, &dom->inodelk_list, + list) + { + __delete_inode_lock(ino_l); + __pl_inodelk_unref(ino_l); } - if (!list_empty (&pl_inode->ext_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending fcntl locks found, releasing."); + list_splice_init(&dom->blocked_inodelks, &inodelks_released); + } + if (!list_empty(&dom->entrylk_list)) { + gf_log(this->name, GF_LOG_WARNING, + "Pending entry locks found, releasing."); - list_for_each_entry_safe (ext_l, ext_tmp, &pl_inode->ext_list, - list) { + list_for_each_entry_safe(entry_l, entry_tmp, &dom->entrylk_list, + domain_list) + { + list_del_init(&entry_l->domain_list); - __delete_lock (pl_inode, ext_l); - if (ext_l->blocked) { - list_add_tail (&ext_l->list, &posixlks_released); - continue; - } - __destroy_lock (ext_l); - } + GF_FREE((char *)entry_l->basename); + 
GF_FREE(entry_l->connection_id); + GF_FREE(entry_l); } + list_splice_init(&dom->blocked_entrylks, &entrylks_released); + } - list_for_each_entry_safe (dom, dom_tmp, &pl_inode->dom_list, inode_list) { + list_del(&dom->inode_list); + gf_log("posix-locks", GF_LOG_TRACE, " Cleaning up domain: %s", + dom->domain); + GF_FREE((char *)(dom->domain)); + GF_FREE(dom); + } + } + pthread_mutex_unlock(&pl_inode->mutex); - if (!list_empty (&dom->inodelk_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending inode locks found, releasing."); + if (!list_empty(&posixlks_released)) { + list_for_each_entry_safe(ext_l, ext_tmp, &posixlks_released, list) + { + STACK_UNWIND_STRICT(lk, ext_l->frame, -1, 0, &ext_l->user_flock, + NULL); + __destroy_lock(ext_l); + } + } - list_for_each_entry_safe (ino_l, ino_tmp, &dom->inodelk_list, list) { - __delete_inode_lock (ino_l); - __pl_inodelk_unref (ino_l); - } + if (!list_empty(&inodelks_released)) { + list_for_each_entry_safe(ino_l, ino_tmp, &inodelks_released, + blocked_locks) + { + STACK_UNWIND_STRICT(inodelk, ino_l->frame, -1, 0, NULL); + __pl_inodelk_unref(ino_l); + } + } - list_splice_init (&dom->blocked_inodelks, &inodelks_released); + if (!list_empty(&entrylks_released)) { + list_for_each_entry_safe(entry_l, entry_tmp, &entrylks_released, + blocked_locks) + { + STACK_UNWIND_STRICT(entrylk, entry_l->frame, -1, 0, NULL); + GF_FREE((char *)entry_l->basename); + GF_FREE(entry_l->connection_id); + GF_FREE(entry_l); + } + } + pthread_mutex_destroy(&pl_inode->mutex); - } - if (!list_empty (&dom->entrylk_list)) { - gf_log (this->name, GF_LOG_WARNING, - "Pending entry locks found, releasing."); + GF_FREE(pl_inode); - list_for_each_entry_safe (entry_l, entry_tmp, &dom->entrylk_list, domain_list) { - list_del_init (&entry_l->domain_list); + return 0; +} - GF_FREE ((char *)entry_l->basename); - GF_FREE (entry_l->connection_id); - GF_FREE (entry_l); - } +int +pl_release(xlator_t *this, fd_t *fd) +{ + pl_inode_t *pl_inode = NULL; + uint64_t 
tmp_pl_inode = 0; + int ret = -1; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; - list_splice_init (&dom->blocked_entrylks, &entrylks_released); - } + if (fd == NULL) { + goto out; + } - list_del (&dom->inode_list); - gf_log ("posix-locks", GF_LOG_TRACE, - " Cleaning up domain: %s", dom->domain); - GF_FREE ((char *)(dom->domain)); - GF_FREE (dom); - } + ret = inode_ctx_get(fd->inode, this, &tmp_pl_inode); + if (ret != 0) + goto clean; - } - pthread_mutex_unlock (&pl_inode->mutex); + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - list_for_each_entry_safe (ext_l, ext_tmp, &posixlks_released, list) { + pl_trace_release(this, fd); - STACK_UNWIND_STRICT (lk, ext_l->frame, -1, 0, - &ext_l->user_flock, NULL); - __destroy_lock (ext_l); - } + gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd); - list_for_each_entry_safe (ino_l, ino_tmp, &inodelks_released, blocked_locks) { + delete_locks_of_fd(this, pl_inode, fd); + pl_update_refkeeper(this, fd->inode); - STACK_UNWIND_STRICT (inodelk, ino_l->frame, -1, 0, NULL); - __pl_inodelk_unref (ino_l); - } +clean: + ret = fd_ctx_del(fd, this, &tmp); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx"); + goto out; + } - list_for_each_entry_safe (entry_l, entry_tmp, &entrylks_released, blocked_locks) { + fdctx = (pl_fdctx_t *)(long)tmp; - STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL); - GF_FREE ((char *)entry_l->basename); - GF_FREE (entry_l->connection_id); - GF_FREE (entry_l); + GF_FREE(fdctx); +out: + return ret; +} - } +int +pl_releasedir(xlator_t *this, fd_t *fd) +{ + int ret = -1; + uint64_t tmp = 0; + pl_fdctx_t *fdctx = NULL; - GF_FREE (pl_inode); + if (fd == NULL) { + goto out; + } - return 0; + ret = fd_ctx_del(fd, this, &tmp); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx"); + goto out; + } + + fdctx = (pl_fdctx_t *)(long)tmp; + + GF_FREE(fdctx); +out: + return ret; } -int -pl_release (xlator_t *this, fd_t *fd) +static int32_t 
+pl_request_link_count(dict_t **pxdata) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = -1; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + dict_t *xdata; - if (fd == NULL) { - goto out; + xdata = *pxdata; + if (xdata == NULL) { + xdata = dict_new(); + if (xdata == NULL) { + return ENOMEM; } + } else { + dict_ref(xdata); + } - ret = inode_ctx_get (fd->inode, this, &tmp_pl_inode); - if (ret != 0) - goto out; + if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { + dict_unref(xdata); + return ENOMEM; + } - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + *pxdata = xdata; - pl_trace_release (this, fd); + return 0; +} - gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks with fd %p", fd); +static int32_t +pl_check_link_count(dict_t *xdata) +{ + int32_t count; - delete_locks_of_fd (this, pl_inode, fd); - pl_update_refkeeper (this, fd->inode); + /* In case we are unable to read the link count from xdata, we take a + * conservative approach and return -2, which will prevent the inode from + * being considered deleted. In fact it will cause link tracking for this + * inode to be disabled completely to avoid races. 
*/ - ret = fd_ctx_del (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Could not get fdctx"); - goto out; - } + if (xdata == NULL) { + return -2; + } - fdctx = (pl_fdctx_t *)(long)tmp; + if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { + return -2; + } - GF_FREE (fdctx); -out: - return ret; + return count; } -int -pl_releasedir (xlator_t *this, fd_t *fd) +int32_t +pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - int ret = -1; - uint64_t tmp = 0; - pl_fdctx_t *fdctx = NULL; + pl_inode_t *pl_inode; - if (fd == NULL) { - goto out; + if (op_ret >= 0) { + pl_inode = pl_inode_get(this, inode, NULL); + if (pl_inode == NULL) { + PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, + NULL); + return 0; } - ret = fd_ctx_del (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Could not get fdctx"); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + + /* We only update the link count if we previously didn't know it. + * Doing it always can lead to races since lookup is not executed + * atomically most of the times. */ + if (pl_inode->links == -2) { + pl_inode->links = pl_check_link_count(xdata); + if (buf->ia_type == IA_IFDIR) { + /* Directories have at least 2 links. To avoid special handling + * for directories, we simply decrement the value here to make + * them equivalent to regular files. 
*/ + pl_inode->links--; + } } - fdctx = (pl_fdctx_t *)(long)tmp; + pthread_mutex_unlock(&pl_inode->mutex); + } - GF_FREE (fdctx); -out: - return ret; + PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; } int32_t -__get_posixlk_count (xlator_t *this, pl_inode_t *pl_inode) +pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - posix_lock_t *lock = NULL; - int32_t count = 0; - - list_for_each_entry (lock, &pl_inode->ext_list, list) { - - count++; - } + int32_t error; + + error = pl_request_link_count(&xdata); + if (error == 0) { + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + dict_unref(xdata); + } else { + STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); + } + return 0; +} - return count; +int32_t +pl_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND(fstat, xdata, frame, op_ret, op_errno, buf, xdata); + return 0; } int32_t -get_posixlk_count (xlator_t *this, inode_t *inode) +pl_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - pl_inode_t *pl_inode = NULL; - uint64_t tmp_pl_inode = 0; - int ret = 0; - int32_t count = 0; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} - ret = inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret != 0) { - goto out; - } +int +pl_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + pl_local_t *local = NULL; + gf_dirent_t *entry = NULL; - pl_inode = (pl_inode_t *)(long) tmp_pl_inode; + if (op_ret <= 0) + goto unwind; - pthread_mutex_lock (&pl_inode->mutex); - { - count =__get_posixlk_count 
(this, pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + local = frame->local; + if (!local) + goto unwind; -out: - return count; + list_for_each_entry(entry, &entries->list, list) + { + pl_set_xdata_response(this, local, local->fd->inode, entry->inode, + entry->d_name, entry->dict, 0); + } + +unwind: + PL_STACK_UNWIND(readdirp, xdata, frame, op_ret, op_errno, entries, xdata); + + return 0; } -void -pl_parent_entrylk_xattr_fill (xlator_t *this, inode_t *parent, - char *basename, dict_t *dict) +int +pl_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - uint32_t entrylk = 0; - int ret = -1; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + + return 0; +} + +lock_migration_info_t * +gf_mig_info_for_lock(posix_lock_t *lock) +{ + lock_migration_info_t *new = GF_MALLOC(sizeof(lock_migration_info_t), + gf_common_mt_lock_mig); + if (new == NULL) { + goto out; + } + + INIT_LIST_HEAD(&new->list); + + posix_lock_to_flock(lock, &new->flock); + + new->lk_flags = lock->lk_flags; + + new->client_uid = gf_strdup(lock->client_uid); - if (!parent || !basename || !strlen (basename)) - goto out; - entrylk = check_entrylk_on_basename (this, parent, basename); out: - ret = dict_set_uint32 (dict, GLUSTERFS_PARENT_ENTRYLK, entrylk); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_PARENT_ENTRYLK); - } + return new; } -void -pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) +int +pl_fill_active_locks(pl_inode_t *pl_inode, lock_migration_info_t *lmi) { - int32_t count = 0; - int ret = -1; + posix_lock_t *temp = NULL; + lock_migration_info_t *newlock = NULL; + int count = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (list_empty(&pl_inode->ext_list)) { + count = 0; + goto unlock; + } - count = get_entrylk_count (this, inode); 
- ret = dict_set_int32 (dict, GLUSTERFS_ENTRYLK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_ENTRYLK_COUNT); + list_for_each_entry(temp, &pl_inode->ext_list, list) + { + if (temp->blocked) + continue; + + newlock = gf_mig_info_for_lock(temp); + if (!newlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "lock_dup failed"); + count = -1; + goto out; + } + + list_add_tail(&newlock->list, &lmi->list); + count++; } + } +unlock: + pthread_mutex_unlock(&pl_inode->mutex); +out: + return count; } -void -pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict, - gf_boolean_t per_dom) +/* This function reads only active locks */ +static int +pl_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int32_t count = 0; - int ret = -1; - char *domname = NULL; + pl_inode_t *pl_inode = NULL; + lock_migration_info_t locks; + int op_ret = 0; + int op_errno = 0; + int count = 0; + INIT_LIST_HEAD(&locks.list); - if (per_dom){ - ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT, - &domname); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get " - "value for key %s",GLUSTERFS_INODELK_DOM_COUNT); - goto out; - } - } + pl_inode = pl_inode_get(this, loc->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed"); - count = get_inodelk_count (this, inode, domname); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } - ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for " - "key %s", GLUSTERFS_INODELK_COUNT); - } + count = pl_fill_active_locks(pl_inode, &locks); + + op_ret = count; out: - return; + STACK_UNWIND_STRICT(getactivelk, frame, op_ret, op_errno, &locks, NULL); + + gf_free_mig_locks(&locks); + + return 0; } void -pl_posixlk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) 
+pl_metalk_unref(pl_meta_lock_t *lock) { - int32_t count = 0; - int ret = -1; + lock->ref--; + if (!lock->ref) { + GF_FREE(lock->client_uid); + GF_FREE(lock); + } +} - count = get_posixlk_count (this, inode); - ret = dict_set_int32 (dict, GLUSTERFS_POSIXLK_COUNT, count); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_POSIXLK_COUNT); - } +void +__pl_metalk_ref(pl_meta_lock_t *lock) +{ + lock->ref++; +} +pl_meta_lock_t * +new_meta_lock(call_frame_t *frame, xlator_t *this) +{ + pl_meta_lock_t *lock = GF_CALLOC(1, sizeof(*lock), + gf_locks_mt_pl_meta_lock_t); + + if (!lock) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "mem allocation" + " failed for meta lock"); + goto out; + } + + INIT_LIST_HEAD(&lock->list); + INIT_LIST_HEAD(&lock->client_list); + + lock->client_uid = gf_strdup(frame->root->client->client_uid); + if (!lock->client_uid) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "mem allocation" + " failed for client_uid"); + GF_FREE(lock); + lock = NULL; + goto out; + } + + __pl_metalk_ref(lock); +out: + return lock; } -int32_t -pl_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *xdata, - struct iatt *postparent) +int +pl_insert_metalk(pl_inode_t *pl_inode, pl_ctx_t *ctx, pl_meta_lock_t *lock) { - pl_local_t *local = NULL; + int ret = 0; - GF_VALIDATE_OR_GOTO (this->name, frame->local, out); + if (!pl_inode || !ctx || !lock) { + gf_msg(THIS->name, GF_LOG_INFO, 0, 0, "NULL parameter"); + ret = -1; + goto out; + } - if (op_ret) - goto out; + lock->pl_inode = pl_inode; - local = frame->local; + /* refer function pl_inode_setlk for more info for this ref. 
+ * This should be unrefed on meta-unlock triggered by rebalance or + * in cleanup with client disconnect*/ + /*TODO: unref this in cleanup code for disconnect and meta-unlock*/ + pl_inode->inode = inode_ref(pl_inode->inode); - if (local->parent_entrylk_req) - pl_parent_entrylk_xattr_fill (this, local->loc.parent, - (char*)local->loc.name, xdata); - if (local->entrylk_count_req) - pl_entrylk_xattr_fill (this, inode, xdata); - if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, inode, xdata, _gf_false); - if (local->inodelk_dom_count_req) - pl_inodelk_xattr_fill (this, inode, xdata, _gf_true); - if (local->posixlk_count_req) - pl_posixlk_xattr_fill (this, inode, xdata); + /* NOTE:In case of a client-server disconnect we need to cleanup metalk. + * Hence, adding the metalk to pl_ctx_t as well. The mutex lock order + * should always be on ctx and then on pl_inode*/ + pthread_mutex_lock(&ctx->lock); + { + pthread_mutex_lock(&pl_inode->mutex); + { + list_add_tail(&lock->list, &pl_inode->metalk_list); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_add_tail(&lock->client_list, &ctx->metalk_list); + } + pthread_mutex_unlock(&ctx->lock); out: - local = frame->local; - frame->local = NULL; - - if (local != NULL) { - loc_wipe (&local->loc); - mem_put (local); - } - - STACK_UNWIND_STRICT ( - lookup, - frame, - op_ret, - op_errno, - inode, - buf, - xdata, - postparent); - return 0; + return ret; } int32_t -pl_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xdata) +pl_metalk(call_frame_t *frame, xlator_t *this, inode_t *inode) { - pl_local_t *local = NULL; - int ret = -1; + pl_inode_t *pl_inode = NULL; + int ret = 0; + pl_meta_lock_t *reqlk = NULL; + pl_ctx_t *ctx = NULL; + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, + "pl_inode mem allocation failedd"); + + ret = -1; + goto out; + } + + /* Non rebalance process trying to do metalock */ + if (frame->root->pid != 
GF_CLIENT_PID_DEFRAG) { + ret = -1; + goto out; + } + + /* Note: In the current scheme of glusterfs where lock migration is + * experimental, (ideally) the rebalance process which is migrating + * the file should request for a metalock. Hence, the metalock count + * should not be more than one for an inode. In future, if there is a + * need for meta-lock from other clients, the following block can be + * removed. + * + * Since pl_metalk is called as part of setxattr operation, any client + * process(non-rebalance) residing outside trusted network can exhaust + * memory of the server node by issuing setxattr repetitively on the + * metalock key. The following code makes sure that more than + * one metalock cannot be granted on an inode*/ + pthread_mutex_lock(&pl_inode->mutex); + { + if (pl_metalock_is_active(pl_inode)) { + ret = -1; + } + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, 0, + "More than one meta-lock cannot be granted on" + " the inode"); + goto out; + } + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed"); + + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "frame-root-client " + "is NULL"); + + ret = -1; + goto out; + } + + reqlk = new_meta_lock(frame, this); + if (!reqlk) { + ret = -1; + goto out; + } + + ret = pl_insert_metalk(pl_inode, ctx, reqlk); + if (ret < 0) { + pl_metalk_unref(reqlk); + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); +out: + return ret; +} + +static void +__unwind_queued_locks(pl_inode_t *pl_inode, struct list_head *tmp_list) +{ + if (list_empty(&pl_inode->queued_locks)) + return; - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, out); + list_splice_init(&pl_inode->queued_locks, tmp_list); +} - if (xdata) { - if (dict_get (xdata, 
GLUSTERFS_ENTRYLK_COUNT)) - local->entrylk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_INODELK_COUNT)) - local->inodelk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT)) - local->inodelk_dom_count_req = 1; - if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT)) - local->posixlk_count_req = 1; - if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK)) - local->parent_entrylk_req = 1; - } +static void +__unwind_blocked_locks(pl_inode_t *pl_inode, struct list_head *tmp_list) +{ + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; - frame->local = local; - loc_copy (&local->loc, loc); - - STACK_WIND (frame, - pl_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xdata); - ret = 0; -out: - if (ret == -1) - STACK_UNWIND_STRICT (lookup, frame, -1, 0, NULL, - NULL, NULL, NULL); + if (list_empty(&pl_inode->ext_list)) + return; - return 0; + list_for_each_entry_safe(lock, tmp, &pl_inode->ext_list, list) + { + if (!lock->blocking) + continue; + + list_del_init(&lock->list); + list_add_tail(&lock->list, tmp_list); + } } + int -pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +pl_metaunlock(call_frame_t *frame, xlator_t *this, inode_t *inode, dict_t *dict) { - pl_local_t *local = NULL; - gf_dirent_t *entry = NULL; + pl_inode_t *pl_inode = NULL; + int ret = 0; + pl_meta_lock_t *meta_lock = NULL; + pl_meta_lock_t *tmp_metalk = NULL; + pl_ctx_t *ctx = NULL; + posix_lock_t *posix_lock = NULL; + posix_lock_t *tmp_posixlk = NULL; + struct list_head tmp_posixlk_list; + + INIT_LIST_HEAD(&tmp_posixlk_list); + + if (frame->root->client) { + ctx = pl_ctx_get(frame->root->client, this); + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed"); + + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "frame-root-client is " + "NULL"); + ret = -1; + goto out; + } + + pl_inode = pl_inode_get(this, inode, NULL); + if (!pl_inode) { + 
ret = -1; + goto out; + } + + pthread_mutex_lock(&ctx->lock); + { + pthread_mutex_lock(&pl_inode->mutex); + { + /* Unwind queued locks regardless of migration status */ + __unwind_queued_locks(pl_inode, &tmp_posixlk_list); - local = frame->local; + /* Unwind blocked locks only for successful migration */ + if (dict_get_sizen(dict, "status")) { + /* unwind all blocked locks */ + __unwind_blocked_locks(pl_inode, &tmp_posixlk_list); + } - if (op_ret <= 0) - goto unwind; + /* unlock metalk */ + /* if this list is empty then pl_inode->metalk_list + * should be empty too. meta lock should in all cases + * be added/removed from both pl_ctx_t and pl_inode */ + + if (list_empty(&ctx->metalk_list)) + goto unlock; + + list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list, + client_list) + { + list_del_init(&meta_lock->client_list); + + pl_inode = meta_lock->pl_inode; - list_for_each_entry (entry, &entries->list, list) { - if (local->entrylk_count_req) - pl_entrylk_xattr_fill (this, entry->inode, entry->dict); - if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict, - _gf_false); - if (local->inodelk_dom_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict, - _gf_true); - if (local->posixlk_count_req) - pl_posixlk_xattr_fill (this, entry->inode, entry->dict); + list_del_init(&meta_lock->list); + + pl_metalk_unref(meta_lock); + + /* The corresponding ref is taken in + * pl_insert_metalk*/ + inode_unref(pl_inode->inode); + } + + if (dict_get_sizen(dict, "status")) + pl_inode->migrated = _gf_true; + else + pl_inode->migrated = _gf_false; } + unlock: -unwind: - frame->local = NULL; - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + pthread_mutex_unlock(&pl_inode->mutex); + } + pthread_mutex_unlock(&ctx->lock); - if (local) - mem_put (local); +out: + list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list) + { + list_del_init(&posix_lock->list); - return 0; + 
STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE, + &posix_lock->user_flock, NULL); + + __destroy_lock(posix_lock); + } + + return ret; } -int -pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +int32_t +pl_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - pl_local_t *local = NULL; - - local = mem_get0 (this->local_pool); - GF_VALIDATE_OR_GOTO (this->name, local, out); + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } - if (dict) { - if (dict_get (dict, GLUSTERFS_ENTRYLK_COUNT)) - local->entrylk_count_req = 1; - if (dict_get (dict, GLUSTERFS_INODELK_COUNT)) - local->inodelk_count_req = 1; - if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT)) - local->inodelk_dom_count_req = 1; - if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT)) - local->posixlk_count_req = 1; + pthread_mutex_lock(&pl_inode->mutex); + { + while (pl_inode->fop_wind_count > 0) { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "waiting for existing fops (count %d) to drain for " + "gfid %s", + pl_inode->fop_wind_count, uuid_utoa(pl_inode->gfid)); + pthread_cond_wait(&pl_inode->check_fop_wind_count, + &pl_inode->mutex); + } + pl_inode->mlock_enforced = _gf_true; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } - frame->local = local; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +pl_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = EINVAL; + dict_t *xdata_rsp = NULL; + char *name = NULL; + posix_locks_private_t *priv = this->private; - STACK_WIND (frame, pl_readdirp_cbk, - 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, offset, dict); + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); - return 0; -out: - STACK_UNWIND_STRICT (readdirp, frame, -1, ENOMEM, NULL, NULL); - return 0; -} + if (dict_get_sizen(dict, GF_META_LOCK_KEY)) { + op_ret = pl_metalk(frame, this, loc->inode); + + } else if (dict_get_sizen(dict, GF_META_UNLOCK_KEY)) { + op_ret = pl_metaunlock(frame, this, loc->inode, dict); + } else { + goto usual; + } + + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata_rsp, frame, op_ret, op_errno, + xdata_rsp); + return 0; + +usual: + PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, ((fd_t *)NULL), + priv); + + STACK_WIND(frame, pl_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata); + + return 0; +} void -pl_dump_lock (char *str, int size, struct gf_flock *flock, - gf_lkowner_t *owner, void *trans, char *conn_id, - time_t *granted_time, time_t *blkd_time, gf_boolean_t active) +pl_dump_lock(char *str, int size, struct gf_flock *flock, gf_lkowner_t *owner, + void *trans, char *conn_id, time_t *granted_time, + time_t *blkd_time, gf_boolean_t active) { - char *type_str = NULL; - char granted[32] = {0,}; - char blocked[32] = {0,}; - - switch (flock->l_type) { + char *type_str = NULL; + char granted[GF_TIMESTR_SIZE] = { + 0, + }; + char blocked[GF_TIMESTR_SIZE] = { + 0, + }; + + if (granted_time) + gf_time_fmt(granted, sizeof(granted), *granted_time, gf_timefmt_FT); + if (blkd_time) + gf_time_fmt(blocked, sizeof(blocked), *blkd_time, gf_timefmt_FT); + switch (flock->l_type) { case F_RDLCK: - type_str = "READ"; - break; + type_str = "READ"; + break; case F_WRLCK: - type_str = "WRITE"; - break; + type_str = "WRITE"; + break; case F_UNLCK: - type_str = "UNLOCK"; - break; + type_str = "UNLOCK"; + break; default: - type_str = "UNKNOWN"; - break; + type_str = 
"UNKNOWN"; + break; + } + + if (active) { + if (blkd_time && *blkd_time == 0) { + snprintf(str, size, RANGE_GRNTD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), + trans, conn_id, granted); + } else { + snprintf(str, size, RANGE_BLKD_GRNTD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), + trans, conn_id, blocked, granted); } + } else { + snprintf(str, size, RANGE_BLKD_FMT, type_str, flock->l_whence, + (unsigned long long)flock->l_start, + (unsigned long long)flock->l_len, + (unsigned long long)flock->l_pid, lkowner_utoa(owner), trans, + conn_id, blocked); + } +} - if (active) { - if (blkd_time && *blkd_time == 0) { - snprintf (str, size, RANGE_GRNTD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r (granted_time, granted)); - } else { - snprintf (str, size, RANGE_BLKD_GRNTD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r (blkd_time, blocked), - ctime_r (granted_time, granted)); - } - } - else { - snprintf (str, size, RANGE_BLKD_FMT, - type_str, flock->l_whence, - (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, conn_id, - ctime_r (blkd_time, blocked)); +void +__dump_entrylks(pl_inode_t *pl_inode) +{ + pl_dom_list_t *dom = NULL; + pl_entry_lock_t *lock = NULL; + char blocked[GF_TIMESTR_SIZE] = { + 0, + }; + char granted[GF_TIMESTR_SIZE] = { + 0, + }; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char *k = 
"xlator.feature.locks.lock-dump.domain.entrylk"; + + char tmp[4098]; + + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + count = 0; + + gf_proc_dump_build_key(key, "lock-dump.domain", "domain"); + gf_proc_dump_write(key, "%s", dom->domain); + + list_for_each_entry(lock, &dom->entrylk_list, domain_list) + { + gf_time_fmt(granted, sizeof(granted), lock->granted_time, + gf_timefmt_FT); + gf_proc_dump_build_key(key, k, "entrylk[%d](ACTIVE)", count); + if (lock->blkd_time == 0) { + snprintf(tmp, sizeof(tmp), ENTRY_GRNTD_FMT, + lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" + : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, + lock->connection_id, granted); + } else { + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, + gf_timefmt_FT); + snprintf(tmp, sizeof(tmp), ENTRY_BLKD_GRNTD_FMT, + lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" + : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, + lock->connection_id, blocked, granted); + } + + gf_proc_dump_write(key, "%s", tmp); + + count++; } + list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks) + { + gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time, + gf_timefmt_FT); + + gf_proc_dump_build_key(key, k, "entrylk[%d](BLOCKED)", count); + snprintf( + tmp, sizeof(tmp), ENTRY_BLKD_FMT, + lock->type == ENTRYLK_RDLCK ? 
"ENTRYLK_RDLCK" : "ENTRYLK_WRLCK", + lock->basename, (unsigned long long)lock->client_pid, + lkowner_utoa(&lock->owner), lock->client, lock->connection_id, + blocked); + + gf_proc_dump_write(key, "%s", tmp); + + count++; + } + } } void -__dump_entrylks (pl_inode_t *pl_inode) -{ - pl_dom_list_t *dom = NULL; - pl_entry_lock_t *lock = NULL; - char blocked[32] = {0,}; - char granted[32] = {0,}; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0,}; - - char tmp[256]; - - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - - count = 0; - - gf_proc_dump_build_key(key, - "lock-dump.domain", - "domain"); - gf_proc_dump_write(key, "%s", dom->domain); - - list_for_each_entry (lock, &dom->entrylk_list, domain_list) { - - gf_proc_dump_build_key(key, - "xlator.feature.locks.lock-dump.domain.entrylk", - "entrylk[%d](ACTIVE)", count ); - if (lock->blkd_time.tv_sec == 0 && lock->blkd_time.tv_usec == 0) { - snprintf (tmp, 256, ENTRY_GRNTD_FMT, - lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->trans, - lock->connection_id, - ctime_r (&lock->granted_time.tv_sec, granted)); - } else { - snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT, - lock->type == ENTRYLK_RDLCK ? 
"ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->trans, - lock->connection_id, - ctime_r (&lock->blkd_time.tv_sec, blocked), - ctime_r (&lock->granted_time.tv_sec, granted)); - } +dump_entrylks(pl_inode_t *pl_inode) +{ + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_entrylks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} - gf_proc_dump_write(key, tmp); +void +__dump_inodelks(pl_inode_t *pl_inode) +{ + pl_dom_list_t *dom = NULL; + pl_inode_lock_t *lock = NULL; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN]; - count++; - } + char tmp[4098]; - list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) { + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + count = 0; - gf_proc_dump_build_key(key, - "xlator.feature.locks.lock-dump.domain.entrylk", - "entrylk[%d](BLOCKED)", count ); - snprintf (tmp, 256, ENTRY_BLKD_FMT, - lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : - "ENTRYLK_WRLCK", lock->basename, - (unsigned long long) lock->client_pid, - lkowner_utoa (&lock->owner), lock->trans, - lock->connection_id, - ctime_r (&lock->blkd_time.tv_sec, blocked)); + gf_proc_dump_build_key(key, "lock-dump.domain", "domain"); + gf_proc_dump_write(key, "%s", dom->domain); - gf_proc_dump_write(key, tmp); + list_for_each_entry(lock, &dom->inodelk_list, list) + { + gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](ACTIVE)", + count); - count++; - } + SET_FLOCK_PID(&lock->user_flock, lock); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, lock->connection_id, &lock->granted_time, + &lock->blkd_time, _gf_true); + gf_proc_dump_write(key, "%s", tmp); + count++; } + list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks) + { + gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](BLOCKED)", + count); + SET_FLOCK_PID(&lock->user_flock, lock); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, 
lock->connection_id, 0, &lock->blkd_time, + _gf_false); + gf_proc_dump_write(key, "%s", tmp); + + count++; + } + } } void -dump_entrylks (pl_inode_t *pl_inode) +dump_inodelks(pl_inode_t *pl_inode) { - pthread_mutex_lock (&pl_inode->mutex); - { - __dump_entrylks (pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_inodelks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} +void +__dump_posixlks(pl_inode_t *pl_inode) +{ + posix_lock_t *lock = NULL; + int count = 0; + char key[GF_DUMP_MAX_BUF_LEN]; + + char tmp[4098]; + + list_for_each_entry(lock, &pl_inode->ext_list, list) + { + SET_FLOCK_PID(&lock->user_flock, lock); + gf_proc_dump_build_key(key, "posixlk", "posixlk[%d](%s)", count, + lock->blocked ? "BLOCKED" : "ACTIVE"); + pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner, + lock->client, lock->client_uid, &lock->granted_time, + &lock->blkd_time, (lock->blocked) ? _gf_false : _gf_true); + gf_proc_dump_write(key, "%s", tmp); + + count++; + } } void -__dump_inodelks (pl_inode_t *pl_inode) +dump_posixlks(pl_inode_t *pl_inode) { - pl_dom_list_t *dom = NULL; - pl_inode_lock_t *lock = NULL; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN]; + pthread_mutex_lock(&pl_inode->mutex); + { + __dump_posixlks(pl_inode); + } + pthread_mutex_unlock(&pl_inode->mutex); +} + +int32_t +pl_dump_inode_priv(xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t tmp_pl_inode = 0; + pl_inode_t *pl_inode = NULL; + char *pathname = NULL; + gf_boolean_t section_added = _gf_false; + + int count = 0; + + if (!inode) { + errno = EINVAL; + goto out; + } + + ret = TRY_LOCK(&inode->lock); + if (ret) + goto out; + { + ret = __inode_ctx_get(inode, this, &tmp_pl_inode); + if (ret) + goto unlock; + } +unlock: + UNLOCK(&inode->lock); + if (ret) + goto out; + + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + if (!pl_inode) { + ret = -1; + goto out; + } + + gf_proc_dump_add_section("xlator.features.locks.%s.inode", 
this->name); + section_added = _gf_true; + + /*We are safe to call __inode_path since we have the + * inode->table->lock */ + __inode_path(inode, NULL, &pathname); + if (pathname) + gf_proc_dump_write("path", "%s", pathname); + + gf_proc_dump_write("mandatory", "%d", pl_inode->mandatory); + + ret = pthread_mutex_trylock(&pl_inode->mutex); + if (ret) + goto out; + { + count = __get_entrylk_count(this, pl_inode); + if (count) { + gf_proc_dump_write("entrylk-count", "%d", count); + __dump_entrylks(pl_inode); + } - char tmp[256]; + count = __get_inodelk_count(this, pl_inode, NULL); + if (count) { + gf_proc_dump_write("inodelk-count", "%d", count); + __dump_inodelks(pl_inode); + } - list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { + count = __get_posixlk_count(pl_inode); + if (count) { + gf_proc_dump_write("posixlk-count", "%d", count); + __dump_posixlks(pl_inode); + } - count = 0; + gf_proc_dump_write("links", "%d", pl_inode->links); + gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); + gf_proc_dump_write("removed", "%u", pl_inode->removed); + } + pthread_mutex_unlock(&pl_inode->mutex); - gf_proc_dump_build_key(key, - "lock-dump.domain", - "domain"); - gf_proc_dump_write(key, "%s", dom->domain); +out: + GF_FREE(pathname); + + if (ret && inode) { + if (!section_added) + gf_proc_dump_add_section( + "xlator.features.locks.%s." 
+ "inode", + this->name); + gf_proc_dump_write("Unable to print lock state", + "(Lock " + "acquisition failure) %s", + uuid_utoa(inode->gfid)); + } + return ret; +} - list_for_each_entry (lock, &dom->inodelk_list, list) { +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; - gf_proc_dump_build_key(key, - "inodelk", - "inodelk[%d](ACTIVE)",count ); + if (!this) + return ret; - SET_FLOCK_PID (&lock->user_flock, lock); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, - lock->client, lock->connection_id, - &lock->granted_time.tv_sec, - &lock->blkd_time.tv_sec, - _gf_true); - gf_proc_dump_write(key, tmp); + ret = xlator_mem_acct_init(this, gf_locks_mt_end + 1); - count++; - } + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting init" + "failed"); + return ret; + } - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { + return ret; +} - gf_proc_dump_build_key(key, - "inodelk", - "inodelk[%d](BLOCKED)",count ); - SET_FLOCK_PID (&lock->user_flock, lock); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, - lock->client, lock->connection_id, - 0, &lock->blkd_time.tv_sec, - _gf_false); - gf_proc_dump_write(key, tmp); +pl_ctx_t * +pl_ctx_get(client_t *client, xlator_t *xlator) +{ + void *tmp = NULL; + pl_ctx_t *ctx = NULL; + pl_ctx_t *setted_ctx = NULL; - count++; - } + client_ctx_get(client, xlator, &tmp); - } + ctx = tmp; + + if (ctx != NULL) + goto out; + ctx = GF_CALLOC(1, sizeof(pl_ctx_t), gf_locks_mt_posix_lock_t); + + if (ctx == NULL) + goto out; + + pthread_mutex_init(&ctx->lock, NULL); + INIT_LIST_HEAD(&ctx->inodelk_lockers); + INIT_LIST_HEAD(&ctx->entrylk_lockers); + INIT_LIST_HEAD(&ctx->metalk_list); + + setted_ctx = client_ctx_set(client, xlator, ctx); + if (ctx != setted_ctx) { + pthread_mutex_destroy(&ctx->lock); + GF_FREE(ctx); + ctx = setted_ctx; + } +out: + return ctx; } -void -dump_inodelks (pl_inode_t *pl_inode) +int +pl_metalk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) { - 
pthread_mutex_lock (&pl_inode->mutex); + pl_meta_lock_t *meta_lock = NULL; + pl_meta_lock_t *tmp_metalk = NULL; + pl_inode_t *pl_inode = NULL; + posix_lock_t *posix_lock = NULL; + posix_lock_t *tmp_posixlk = NULL; + struct list_head tmp_posixlk_list; + + INIT_LIST_HEAD(&tmp_posixlk_list); + + pthread_mutex_lock(&ctx->lock); + { + /* if this list is empty then pl_inode->metalk_list should be + * empty too. meta lock should in all cases be added/removed + * from both pl_ctx_t and pl_inode */ + if (list_empty(&ctx->metalk_list)) + goto unlock; + + list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list, + client_list) { - __dump_inodelks (pl_inode); + list_del_init(&meta_lock->client_list); + + pl_inode = meta_lock->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + + { + /* Since the migration status is unknown here + * unwind all queued and blocked locks to check + * migration status and find the correct + * destination */ + __unwind_queued_locks(pl_inode, &tmp_posixlk_list); + + __unwind_blocked_locks(pl_inode, &tmp_posixlk_list); + + list_del_init(&meta_lock->list); + + pl_metalk_unref(meta_lock); + } + pthread_mutex_unlock(&pl_inode->mutex); + + /* The corresponding ref is taken in + * pl_insert_metalk*/ + inode_unref(pl_inode->inode); } - pthread_mutex_unlock (&pl_inode->mutex); + } +unlock: + pthread_mutex_unlock(&ctx->lock); + + list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list) + { + list_del_init(&posix_lock->list); + + STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE, + &posix_lock->user_flock, NULL); + + __destroy_lock(posix_lock); + } + return 0; } -void -__dump_posixlks (pl_inode_t *pl_inode) +static int +pl_client_disconnect_cbk(xlator_t *this, client_t *client) { - posix_lock_t *lock = NULL; - int count = 0; - char key[GF_DUMP_MAX_BUF_LEN]; + pl_ctx_t *pl_ctx = pl_ctx_get(client, this); + if (pl_ctx) { + pl_inodelk_client_cleanup(this, pl_ctx); + pl_entrylk_client_cleanup(this, pl_ctx); + 
pl_metalk_client_cleanup(this, pl_ctx); + } + + return 0; +} - char tmp[256]; +static int +pl_client_destroy_cbk(xlator_t *this, client_t *client) +{ + void *tmp = NULL; + pl_ctx_t *pl_ctx = NULL; - list_for_each_entry (lock, &pl_inode->ext_list, list) { + pl_client_disconnect_cbk(this, client); - SET_FLOCK_PID (&lock->user_flock, lock); - gf_proc_dump_build_key(key, - "posixlk", - "posixlk[%d](%s)", - count, - lock->blocked ? "BLOCKED" : "ACTIVE"); - pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, lock->client, NULL, - &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, - (lock->blocked)? _gf_false: _gf_true); - gf_proc_dump_write(key, tmp); + client_ctx_del(client, this, &tmp); - count++; - } + if (tmp == NULL) + return 0; + + pl_ctx = tmp; + + GF_ASSERT(list_empty(&pl_ctx->inodelk_lockers)); + GF_ASSERT(list_empty(&pl_ctx->entrylk_lockers)); + + pthread_mutex_destroy(&pl_ctx->lock); + GF_FREE(pl_ctx); + + return 0; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + posix_locks_private_t *priv = this->private; + int ret = -1; + char *tmp_str = NULL; + + GF_OPTION_RECONF("trace", priv->trace, options, bool, out); + + GF_OPTION_RECONF("monkey-unlocking", priv->monkey_unlocking, options, bool, + out); + + GF_OPTION_RECONF("revocation-secs", priv->revocation_secs, options, uint32, + out); + + GF_OPTION_RECONF("revocation-clear-all", priv->revocation_clear_all, + options, bool, out); + + GF_OPTION_RECONF("revocation-max-blocked", priv->revocation_max_blocked, + options, uint32, out); + + GF_OPTION_RECONF("notify-contention", priv->notify_contention, options, + bool, out); + + GF_OPTION_RECONF("notify-contention-delay", priv->notify_contention_delay, + options, uint32, out); + + GF_OPTION_RECONF("mandatory-locking", tmp_str, options, str, out); + + GF_OPTION_RECONF("enforce-mandatory-lock", priv->mlock_enforced, options, + bool, out); + + if (!strcmp(tmp_str, "forced")) + priv->mandatory_mode = MLK_FORCED; + else if (!strcmp(tmp_str, "file")) + 
priv->mandatory_mode = MLK_FILE_BASED; + else if (!strcmp(tmp_str, "optimal")) + priv->mandatory_mode = MLK_OPTIMAL; + else + priv->mandatory_mode = MLK_NONE; + + ret = 0; + +out: + return ret; +} + +int +init(xlator_t *this) +{ + posix_locks_private_t *priv = NULL; + xlator_list_t *trav = NULL; + char *tmp_str = NULL; + int ret = -1; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_CRITICAL, + "FATAL: posix-locks should have exactly one child"); + goto out; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, + "Volume is dangling. Please check the volume file."); + } + + trav = this->children; + while (trav->xlator->children) + trav = trav->xlator->children; + + if (strncmp("storage/", trav->xlator->type, 8)) { + gf_log(this->name, GF_LOG_CRITICAL, + "'locks' translator is not loaded over a storage " + "translator"); + goto out; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_locks_mt_posix_locks_private_t); + + GF_OPTION_INIT("mandatory-locking", tmp_str, str, out); + if (!strcmp(tmp_str, "forced")) + priv->mandatory_mode = MLK_FORCED; + else if (!strcmp(tmp_str, "file")) + priv->mandatory_mode = MLK_FILE_BASED; + else if (!strcmp(tmp_str, "optimal")) + priv->mandatory_mode = MLK_OPTIMAL; + else + priv->mandatory_mode = MLK_NONE; + + tmp_str = NULL; + + GF_OPTION_INIT("trace", priv->trace, bool, out); + + GF_OPTION_INIT("monkey-unlocking", priv->monkey_unlocking, bool, out); + + GF_OPTION_INIT("revocation-secs", priv->revocation_secs, uint32, out); + + GF_OPTION_INIT("revocation-clear-all", priv->revocation_clear_all, bool, + out); + + GF_OPTION_INIT("revocation-max-blocked", priv->revocation_max_blocked, + uint32, out); + + GF_OPTION_INIT("notify-contention", priv->notify_contention, bool, out); + + GF_OPTION_INIT("notify-contention-delay", priv->notify_contention_delay, + uint32, out); + + GF_OPTION_INIT("enforce-mandatory-lock", priv->mlock_enforced, bool, out); + + this->local_pool = mem_pool_new(pl_local_t, 32); + 
if (!this->local_pool) { + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = priv; + ret = 0; + +out: + if (ret) { + GF_FREE(priv); + } + return ret; } void -dump_posixlks (pl_inode_t *pl_inode) +fini(xlator_t *this) { - pthread_mutex_lock (&pl_inode->mutex); - { - __dump_posixlks (pl_inode); - } - pthread_mutex_unlock (&pl_inode->mutex); + posix_locks_private_t *priv = this->private; + if (!priv) + return; + this->private = NULL; + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + GF_FREE(priv->brickname); + GF_FREE(priv); + + return; +} + +int +pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int +pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int +pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); +int +pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? 
op_errno : 0); + + PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + + return 0; } int32_t -pl_dump_inode_priv (xlator_t *this, inode_t *inode) +pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { + int32_t error; - int ret = -1; - uint64_t tmp_pl_inode = 0; - pl_inode_t *pl_inode = NULL; - char *pathname = NULL; - gf_boolean_t section_added = _gf_false; + error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename, + pl_rename_cbk, oldloc, newloc, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL, + NULL, NULL); + } - int count = 0; + return 0; +} - if (!inode) { - errno = EINVAL; - goto out; +posix_lock_t * +gf_lkmig_info_to_posix_lock(call_frame_t *frame, lock_migration_info_t *lmi) +{ + posix_lock_t *lock = GF_CALLOC(1, sizeof(posix_lock_t), + gf_locks_mt_posix_lock_t); + if (!lock) + goto out; + + lock->fl_start = lmi->flock.l_start; + lock->fl_type = lmi->flock.l_type; + + if (lmi->flock.l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = lmi->flock.l_start + lmi->flock.l_len - 1; + + lock->client = frame->root->client; + + lock->lk_flags = lmi->lk_flags; + + lock->client_uid = gf_strdup(lmi->client_uid); + if (lock->client_uid == NULL) { + GF_FREE(lock); + lock = NULL; + goto out; + } + + lock->client_pid = lmi->flock.l_pid; + lock->owner = lmi->flock.l_owner; + + INIT_LIST_HEAD(&lock->list); + +out: + return lock; +} + +/* This function is supposed to write the active locks from the source brick(in + * rebalance context) and write here. 
Hence, will add the locks directly to the + * pl_inode->ext_list*/ +int +pl_write_active_locks(call_frame_t *frame, pl_inode_t *pl_inode, + lock_migration_info_t *locklist) +{ + posix_lock_t *newlock = NULL; + lock_migration_info_t *temp = NULL; + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + /* Just making sure the activelk list is empty. Should not + * happen though*/ + if (!list_empty(&pl_inode->ext_list)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "invalid locks found"); + + ret = -1; + goto out; } - ret = TRY_LOCK (&inode->lock); - if (ret) - goto out; - { - ret = __inode_ctx_get (inode, this, &tmp_pl_inode); - if (ret) - goto unlock; + /* This list also should not be empty */ + if (list_empty(&locklist->list)) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "empty lock list"); + + ret = -1; + goto out; } -unlock: - UNLOCK (&inode->lock); - if (ret) - goto out; - pl_inode = (pl_inode_t *)(long)tmp_pl_inode; - if (!pl_inode) { + list_for_each_entry(temp, &locklist->list, list) + { + newlock = gf_lkmig_info_to_posix_lock(frame, temp); + if (!newlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, + "mem allocation failed for newlock"); + ret = -1; goto out; + } + list_add_tail(&newlock->list, &pl_inode->ext_list); } + } + /*TODO: What if few lock add failed with ENOMEM. 
Should the already + * added locks be clearted */ + pthread_mutex_unlock(&pl_inode->mutex); +out: + return ret; +} - gf_proc_dump_add_section("xlator.features.locks.%s.inode", this->name); - section_added = _gf_true; +static int +pl_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, + lock_migration_info_t *locklist, dict_t *xdata) +{ + int op_ret = 0; + int op_errno = 0; + int ret = 0; - /*We are safe to call __inode_path since we have the - * inode->table->lock */ - __inode_path (inode, NULL, &pathname); - if (pathname) - gf_proc_dump_write ("path", "%s", pathname); + pl_inode_t *pl_inode = pl_inode_get(this, loc->inode, NULL); + if (!pl_inode) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed"); - gf_proc_dump_write("mandatory", "%d", pl_inode->mandatory); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + ret = pl_write_active_locks(frame, pl_inode, locklist); - ret = pthread_mutex_trylock (&pl_inode->mutex); - if (ret) - goto out; - { - count = __get_entrylk_count (this, pl_inode); - if (count) { - gf_proc_dump_write("entrylk-count", "%d", count); - __dump_entrylks (pl_inode); - } + op_ret = ret; - count = __get_inodelk_count (this, pl_inode, NULL); - if (count) { - gf_proc_dump_write("inodelk-count", "%d", count); - __dump_inodelks (pl_inode); - } +out: + STACK_UNWIND_STRICT(setactivelk, frame, op_ret, op_errno, NULL); - count = __get_posixlk_count (this, pl_inode); - if (count) { - gf_proc_dump_write("posixlk-count", "%d", count); - __dump_posixlks (pl_inode); - } - } - pthread_mutex_unlock (&pl_inode->mutex); + return 0; +} -out: - GF_FREE (pathname); +int32_t +pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0); - if (ret && inode) { - if (!section_added) - gf_proc_dump_add_section ("xlator.features.locks.%s." 
- "inode", this->name); - gf_proc_dump_write ("Unable to print lock state", "(Lock " - "acquisition failure) %s", - uuid_utoa (inode->gfid)); - } - return ret; + PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); + + return 0; } int32_t -mem_acct_init (xlator_t *this) +pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - int ret = -1; + int32_t error; - if (!this) - return ret; + error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink, + pl_unlink_cbk, loc, xflag, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL); + } - ret = xlator_mem_acct_init (this, gf_locks_mt_end + 1); + return 0; +} - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } +int32_t +pl_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(mkdir, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} - return ret; +int +pl_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; } +int32_t +pl_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(stat, xdata, frame, op_ret, op_errno, buf, + xdata); + return 0; +} -pl_ctx_t* -pl_ctx_get (client_t *client, xlator_t *xlator) +int +pl_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - void *tmp = NULL; - pl_ctx_t *ctx = NULL; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); 
+ STACK_WIND(frame, pl_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} - client_ctx_get (client, xlator, &tmp); +int32_t +pl_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(mknod, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} - ctx = tmp; +int +pl_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +} - if (ctx != NULL) - goto out; +int32_t +pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + pl_inode_remove_cbk(this, cookie, op_ret < 0 ? 
op_errno : 0); - ctx = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t); + PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); - if (ctx == NULL) - goto out; + return 0; +} - ctx->ltable = pl_lock_table_new(); +int +pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + int32_t error; - if (ctx->ltable == NULL) { - GF_FREE (ctx); - ctx = NULL; - goto out; - } + error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir, + pl_rmdir_cbk, loc, xflags, xdata); + if (error > 0) { + STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL); + } - LOCK_INIT (&ctx->ltable_lock); + return 0; +} - if (client_ctx_set (client, xlator, ctx) != 0) { - LOCK_DESTROY (&ctx->ltable_lock); - GF_FREE (ctx->ltable); - GF_FREE (ctx); - ctx = NULL; - } -out: - return ctx; +int32_t +pl_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(symlink, xdata, frame, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; } -static void -ltable_delete_locks (struct _lock_table *ltable) +int +pl_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; +} + +int32_t +pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - struct _locker *locker = NULL; - struct _locker *tmp = NULL; + pl_inode_t *pl_inode = (pl_inode_t *)cookie; - list_for_each_entry_safe (locker, tmp, <able->inodelk_lockers, lockers) 
{ - if (locker->fd) - pl_del_locker (ltable, locker->volume, &locker->loc, - locker->fd, &locker->owner, - GF_FOP_INODELK); - GF_FREE (locker->volume); - GF_FREE (locker); - } + if (op_ret >= 0) { + pthread_mutex_lock(&pl_inode->mutex); - list_for_each_entry_safe (locker, tmp, <able->entrylk_lockers, lockers) { - if (locker->fd) - pl_del_locker (ltable, locker->volume, &locker->loc, - locker->fd, &locker->owner, - GF_FOP_ENTRYLK); - GF_FREE (locker->volume); - GF_FREE (locker); + /* TODO: can happen pl_inode->links == 0 ? */ + if (pl_inode->links >= 0) { + pl_inode->links++; } - GF_FREE (ltable); + + pthread_mutex_unlock(&pl_inode->mutex); + } + + PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; } +int +pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + pl_inode_t *pl_inode; -static int32_t -destroy_cbk (xlator_t *this, client_t *client) + pl_inode = pl_inode_get(this, oldloc->inode, NULL); + if (pl_inode == NULL) { + STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } + + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); + STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int32_t +pl_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - void *tmp = NULL; - pl_ctx_t *locks_ctx = NULL; + PL_STACK_UNWIND_FOR_CLIENT(fsync, xdata, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} - client_ctx_del (client, this, &tmp); +int +pl_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, 
xdata); + return 0; +} - if (tmp == NULL) - return 0 -; - locks_ctx = tmp; - if (locks_ctx->ltable) - ltable_delete_locks (locks_ctx->ltable); +int32_t +pl_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(readdir, xdata, frame, op_ret, op_errno, entries, + xdata); + return 0; +} - LOCK_DESTROY (&locks_ctx->ltable_lock); - GF_FREE (locks_ctx); +int +pl_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + return 0; +} - return 0; +int32_t +pl_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fsyncdir, xdata, frame, op_ret, op_errno, xdata); + return 0; } +int +pl_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata); + return 0; +} -static int32_t -disconnect_cbk (xlator_t *this, client_t *client) +int32_t +pl_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct statvfs *buf, dict_t *xdata) { - int32_t ret = 0; - pl_ctx_t *locks_ctx = NULL; - struct _lock_table *ltable = NULL; + PL_STACK_UNWIND_FOR_CLIENT(statfs, xdata, frame, op_ret, op_errno, buf, + xdata); + return 0; +} - locks_ctx = pl_ctx_get (client, this); - if (locks_ctx == NULL) { - gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); - goto out; +int +pl_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + 
STACK_WIND(frame, pl_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +} + +int32_t +pl_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; + + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - LOCK (&locks_ctx->ltable_lock); + pthread_mutex_lock(&pl_inode->mutex); { - if (locks_ctx->ltable) { - ltable = locks_ctx->ltable; - locks_ctx->ltable = pl_lock_table_new (); - } + pl_inode->mlock_enforced = _gf_false; + pl_inode->check_mlock_info = _gf_false; + pl_inode->track_fop_wind_count = _gf_true; } - UNLOCK (&locks_ctx->ltable_lock); + pthread_mutex_unlock(&pl_inode->mutex); + } - if (ltable) - ltable_delete_locks (ltable); - -out: - return ret; +unwind: + PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; } - int -init (xlator_t *this) +pl_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - posix_locks_private_t *priv = NULL; - xlator_list_t *trav = NULL; - data_t *mandatory = NULL; - data_t *trace = NULL; - int ret = -1; + int op_ret = 0; + int op_errno = EINVAL; + posix_locks_private_t *priv = this->private; - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: posix-locks should have exactly one child"); - goto out; - } + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. 
Please check the volume file."); - } + PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this, loc, + ((fd_t *)NULL), priv); - trav = this->children; - while (trav->xlator->children) - trav = trav->xlator->children; + STACK_WIND(frame, pl_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; - if (strncmp ("storage/", trav->xlator->type, 8)) { - gf_log (this->name, GF_LOG_CRITICAL, - "'locks' translator is not loaded over a storage " - "translator"); - goto out; - } +unwind: + PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno, + NULL); - priv = GF_CALLOC (1, sizeof (*priv), - gf_locks_mt_posix_locks_private_t); + return 0; +} - mandatory = dict_get (this->options, "mandatory-locks"); - if (mandatory) - gf_log (this->name, GF_LOG_WARNING, - "mandatory locks not supported in this minor release."); +int32_t +pl_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + pl_local_t *local = NULL; + pl_inode_t *pl_inode = NULL; - trace = dict_get (this->options, "trace"); - if (trace) { - if (gf_string2boolean (trace->data, - &priv->trace) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'trace' takes on only boolean values."); - goto out; - } + local = frame->local; + if (local && local->update_mlock_enforced_flag && op_ret != -1) { + pl_inode = pl_inode_get(this, local->inode, NULL); + if (!pl_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } - this->local_pool = mem_pool_new (pl_local_t, 32); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto out; + pthread_mutex_lock(&pl_inode->mutex); + { + pl_inode->mlock_enforced = _gf_false; + pl_inode->check_mlock_info = _gf_false; } + pthread_mutex_unlock(&pl_inode->mutex); + } + +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno, + xdata); + return 0; +} - 
this->private = priv; - ret = 0; +int +pl_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int op_ret = -1; + int op_errno = EINVAL; + posix_locks_private_t *priv = this->private; -out: - if (ret) { - GF_FREE (priv); - } - return ret; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + + PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this, + ((loc_t *)NULL), fd, priv); + + STACK_WIND(frame, pl_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; + +unwind: + PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno, + NULL); + return 0; +} + +int32_t +pl_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uint32_t weak_cksum, + uint8_t *strong_cksum, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(rchecksum, xdata, frame, op_ret, op_errno, + weak_cksum, strong_cksum, xdata); + return 0; +} + +int +pl_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_rchecksum_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); + return 0; } +int32_t +pl_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(xattrop, xdata, frame, op_ret, op_errno, dict, + xdata); + return 0; +} int -fini (xlator_t *this) +pl_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - posix_locks_private_t *priv = NULL; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata); + return 0; +} - priv = this->private; - if (!priv) - return 0; - 
this->private = NULL; - GF_FREE (priv->brickname); - GF_FREE (priv); +int32_t +pl_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fxattrop, xdata, frame, op_ret, op_errno, dict, + xdata); + return 0; +} - return 0; +int +pl_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata); + return 0; } +int32_t +pl_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(setattr, xdata, frame, op_ret, op_errno, statpre, + statpost, xdata); + return 0; +} int -pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, - dict_t *xdata); +pl_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; +} + +int32_t +pl_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fsetattr, xdata, frame, op_ret, op_errno, + statpre, statpost, xdata); + return 0; +} + +int +pl_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, 
xdata); + return 0; +} + +int32_t +pl_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(fallocate, xdata, frame, op_ret, op_errno, pre, + post, xdata); + return 0; +} int -pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata); +pl_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len, + xdata); + return 0; +} + +int32_t +pl_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(readlink, xdata, frame, op_ret, op_errno, path, + buf, xdata); + return 0; +} int -pl_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +pl_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; +} + +int32_t +pl_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(access, xdata, frame, op_ret, op_errno, xdata); + return 0; +} int -pl_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +pl_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + 
PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; +} + +int32_t +pl_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) +{ + PL_STACK_UNWIND_FOR_CLIENT(seek, xdata, frame, op_ret, op_errno, offset, + xdata); + return 0; +} + +int32_t +pl_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL); + STACK_WIND(frame, pl_seek_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata); + return 0; +} struct xlator_fops fops = { - .lookup = pl_lookup, - .create = pl_create, - .truncate = pl_truncate, - .ftruncate = pl_ftruncate, - .open = pl_open, - .readv = pl_readv, - .writev = pl_writev, - .lk = pl_lk, - .inodelk = pl_inodelk, - .finodelk = pl_finodelk, - .entrylk = pl_entrylk, - .fentrylk = pl_fentrylk, - .flush = pl_flush, - .opendir = pl_opendir, - .readdirp = pl_readdirp, - .getxattr = pl_getxattr, - .fgetxattr = pl_fgetxattr, - .fsetxattr = pl_fsetxattr, + .lookup = pl_lookup, + .create = pl_create, + .fstat = pl_fstat, + .truncate = pl_truncate, + .ftruncate = pl_ftruncate, + .discard = pl_discard, + .zerofill = pl_zerofill, + .open = pl_open, + .readv = pl_readv, + .writev = pl_writev, + .lk = pl_lk, + .inodelk = pl_inodelk, + .finodelk = pl_finodelk, + .entrylk = pl_entrylk, + .fentrylk = pl_fentrylk, + .flush = pl_flush, + .opendir = pl_opendir, + .readdirp = pl_readdirp, + .setxattr = pl_setxattr, + .fsetxattr = pl_fsetxattr, + .getxattr = pl_getxattr, + .fgetxattr = pl_fgetxattr, + .removexattr = pl_removexattr, + .fremovexattr = pl_fremovexattr, + .rename = pl_rename, + .getactivelk = pl_getactivelk, + .setactivelk = pl_setactivelk, + .unlink = pl_unlink, + .access = pl_access, + .readlink = pl_readlink, + .fallocate 
= pl_fallocate, + .fsetattr = pl_fsetattr, + .setattr = pl_setattr, + .fxattrop = pl_fxattrop, + .xattrop = pl_xattrop, + .rchecksum = pl_rchecksum, + .statfs = pl_statfs, + .fsyncdir = pl_fsyncdir, + .readdir = pl_readdir, + .symlink = pl_symlink, + .link = pl_link, + .rmdir = pl_rmdir, + .mknod = pl_mknod, + .stat = pl_stat, + .seek = pl_seek, }; struct xlator_dumpops dumpops = { - .inodectx = pl_dump_inode_priv, + .inodectx = pl_dump_inode_priv, }; struct xlator_cbks cbks = { - .forget = pl_forget, - .release = pl_release, - .releasedir = pl_releasedir, - .client_destroy = destroy_cbk, - .client_disconnect = disconnect_cbk, + .forget = pl_forget, + .release = pl_release, + .releasedir = pl_releasedir, + .client_destroy = pl_client_destroy_cbk, + .client_disconnect = pl_client_disconnect_cbk, }; - struct volume_options options[] = { - { .key = { "mandatory-locks", "mandatory" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = { "trace" }, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} }, + {.key = {"mandatory-locking"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Specifies the mandatory-locking mode. Valid options " + "are 'file' to use linux style mandatory locks, " + "'forced' to use volume strictly under mandatory lock " + "semantics only and 'optimal' to treat advisory and " + "mandatory locks separately on their own."}, + {.key = {"trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Trace the different lock requests " + "to logs."}, + {.key = {"monkey-unlocking"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"locks"}, + .description = "Ignore a random number of unlock requests. 
Useful " + "for testing/creating robust lock recovery mechanisms."}, + { + .key = {"revocation-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "Maximum time a lock can be taken out, before" + "being revoked.", + }, + { + .key = {"revocation-clear-all"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "If set to true, will revoke BOTH granted and blocked " + "(pending) lock requests if a revocation threshold is " + "hit.", + }, + {.key = {"revocation-max-blocked"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"locks"}, + .description = "A number of blocked lock requests after which a lock " + "will be revoked to allow the others to proceed. Can " + "be used in conjunction w/ revocation-clear-all."}, + {.key = {"notify-contention"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = {GD_OP_VERSION_4_0_0}, + .tags = {"locks", "contention"}, + .description = "When this option is enabled and a lock request " + "conflicts with a currently granted lock, an upcall " + "notification will be sent to the current owner of " + "the lock to request it to be released as soon as " + "possible."}, + {.key = {"notify-contention-delay"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, /* An upcall notification is sent every time a conflict is + * detected. 
*/ + .max = 60, + .default_value = "5", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = {GD_OP_VERSION_4_0_0}, + .tags = {"locks", "contention", "timeout"}, + .description = "This value determines the minimum amount of time " + "(in seconds) between upcall contention notifications " + "on the same inode. If multiple lock requests are " + "received during this period, only one upcall will " + "be sent."}, + {.key = {"enforce-mandatory-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .flags = OPT_FLAG_SETTABLE, + .op_version = {GD_OP_VERSION_6_0}, + .description = "option to enable lock enforcement"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "locks", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/locks/src/reservelk.c b/xlators/features/locks/src/reservelk.c index 11abd26d85f..604691fd887 100644 --- a/xlators/features/locks/src/reservelk.c +++ b/xlators/features/locks/src/reservelk.c @@ -7,437 +7,376 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include "locks.h" #include "common.h" -void -__delete_reserve_lock (posix_lock_t *lock) -{ - list_del (&lock->list); -} - -void -__destroy_reserve_lock (posix_lock_t *lock) -{ - GF_FREE (lock); -} - /* Return true if the two reservelks have exactly same lock boundaries */ int -reservelks_equal (posix_lock_t *l1, posix_lock_t *l2) +reservelks_equal(posix_lock_t *l1, posix_lock_t *l2) { - if ((l1->fl_start == l2->fl_start) && - (l1->fl_end == l2->fl_end)) - return 1; + if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end)) + return 1; - return 0; + return 0; } /* Determine if lock is grantable or not */ static posix_lock_t * -__reservelk_grantable (pl_inode_t *pl_inode, posix_lock_t *lock) +__reservelk_grantable(pl_inode_t *pl_inode, posix_lock_t *lock) { - xlator_t *this = NULL; - posix_lock_t *l = NULL; - posix_lock_t *ret_lock = NULL; - - this = THIS; - - if (list_empty (&pl_inode->reservelk_list)) { - gf_log (this->name, GF_LOG_TRACE, - "No reservelks in list"); - goto out; - } - list_for_each_entry (l, &pl_inode->reservelk_list, list){ - if (reservelks_equal (lock, l)) { - ret_lock = l; - break; - } + xlator_t *this = THIS; + posix_lock_t *l = NULL; + posix_lock_t *ret_lock = NULL; + + if (list_empty(&pl_inode->reservelk_list)) { + gf_log(this->name, GF_LOG_TRACE, "No reservelks in list"); + goto out; + } + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(lock, l)) { + ret_lock = l; + break; } + } out: - return ret_lock; + return ret_lock; } -static inline int -__same_owner_reservelk (posix_lock_t *l1, 
posix_lock_t *l2) +static int +__same_owner_reservelk(posix_lock_t *l1, posix_lock_t *l2) { - return (is_same_lkowner (&l1->owner, &l2->owner)); - + return (is_same_lkowner(&l1->owner, &l2->owner)); } static posix_lock_t * -__matching_reservelk (pl_inode_t *pl_inode, posix_lock_t *lock) +__matching_reservelk(pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *l = NULL; + posix_lock_t *l = NULL; - if (list_empty (&pl_inode->reservelk_list)) { - gf_log ("posix-locks", GF_LOG_TRACE, - "reservelk list empty"); - return NULL; - } + if (list_empty(&pl_inode->reservelk_list)) { + gf_log("posix-locks", GF_LOG_TRACE, "reservelk list empty"); + return NULL; + } - list_for_each_entry (l, &pl_inode->reservelk_list, list) { - if (reservelks_equal (l, lock)) { - gf_log ("posix-locks", GF_LOG_TRACE, - "equal reservelk found"); - break; - } + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(l, lock)) { + gf_log("posix-locks", GF_LOG_TRACE, "equal reservelk found"); + break; } + } - return l; + return l; } static int -__reservelk_conflict (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock) +__reservelk_conflict(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *conf = NULL; - int ret = 0; - - conf = __matching_reservelk (pl_inode, lock); - if (conf) { - gf_log (this->name, GF_LOG_TRACE, - "Matching reservelk found"); - if (__same_owner_reservelk (lock, conf)) { - list_del_init (&conf->list); - gf_log (this->name, GF_LOG_TRACE, - "Removing the matching reservelk for setlk to progress"); - GF_FREE (conf); - ret = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "Conflicting reservelk found"); - ret = 1; - } - + int ret = 0; + + posix_lock_t *conf = __matching_reservelk(pl_inode, lock); + if (conf) { + gf_log(this->name, GF_LOG_TRACE, "Matching reservelk found"); + if (__same_owner_reservelk(lock, conf)) { + list_del_init(&conf->list); + gf_log(this->name, GF_LOG_TRACE, + "Removing the matching reservelk 
for setlk to progress"); + __destroy_lock(conf); + ret = 0; + } else { + gf_log(this->name, GF_LOG_TRACE, "Conflicting reservelk found"); + ret = 1; } - return ret; - + } + return ret; } int -pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, - posix_lock_t *lock, int can_block) +pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + const int can_block) { - int ret = 0; - - pthread_mutex_lock (&pl_inode->mutex); - { - if (__reservelk_conflict (this, pl_inode, lock)) { - gf_log (this->name, GF_LOG_TRACE, - "Found conflicting reservelk. Blocking until reservelk is unlocked."); - lock->blocked = can_block; - list_add_tail (&lock->list, &pl_inode->blocked_calls); - ret = -1; - goto unlock; - } - - gf_log (this->name, GF_LOG_TRACE, - "no conflicting reservelk found. Call continuing"); - ret = 0; - + int ret = 0; + + pthread_mutex_lock(&pl_inode->mutex); + { + if (__reservelk_conflict(this, pl_inode, lock)) { + lock->blocked = can_block; + list_add_tail(&lock->list, &pl_inode->blocked_calls); + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_TRACE, + "Found conflicting reservelk. Blocking until reservelk is " + "unlocked."); + ret = -1; + goto out; } -unlock: - pthread_mutex_unlock (&pl_inode->mutex); - - return ret; - + } + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_TRACE, + "no conflicting reservelk found. Call continuing"); + ret = 0; +out: + return ret; } - /* Determines if lock can be granted and adds the lock. If the lock * is blocking, adds it to the blocked_reservelks. 
*/ static int -__lock_reservelk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +__lock_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + const int can_block) { - posix_lock_t *conf = NULL; - int ret = -EINVAL; - - conf = __reservelk_grantable (pl_inode, lock); - if (conf){ - ret = -EAGAIN; - if (can_block == 0) - goto out; + int ret = -EINVAL; - list_add_tail (&lock->list, &pl_inode->blocked_reservelks); + posix_lock_t *conf = __reservelk_grantable(pl_inode, lock); + if (conf) { + ret = -EAGAIN; + if (can_block == 0) + goto out; - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); + list_add_tail(&lock->list, &pl_inode->blocked_reservelks); + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); - goto out; - } + goto out; + } - list_add (&lock->list, &pl_inode->reservelk_list); + list_add(&lock->list, &pl_inode->reservelk_list); - ret = 0; + ret = 0; out: - return ret; + return ret; } static posix_lock_t * -find_matching_reservelk (posix_lock_t *lock, pl_inode_t *pl_inode) +find_matching_reservelk(posix_lock_t *lock, pl_inode_t *pl_inode) { - posix_lock_t *l = NULL; - list_for_each_entry (l, &pl_inode->reservelk_list, list) { - if (reservelks_equal (l, lock)) - return l; - } - return NULL; + posix_lock_t *l = NULL; + list_for_each_entry(l, &pl_inode->reservelk_list, list) + { + if (reservelks_equal(l, lock)) + return l; + } + return NULL; } /* Set F_UNLCK removes a lock which has the exact same lock boundaries * as the UNLCK lock specifies. 
If such a lock is not found, returns invalid */ static posix_lock_t * -__reserve_unlock_lock (xlator_t *this, posix_lock_t *lock, pl_inode_t *pl_inode) +__reserve_unlock_lock(xlator_t *this, posix_lock_t *lock, pl_inode_t *pl_inode) { - - posix_lock_t *conf = NULL; - - conf = find_matching_reservelk (lock, pl_inode); - if (!conf) { - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock not found for unlock"); - goto out; - } - __delete_reserve_lock (conf); - gf_log (this->name, GF_LOG_DEBUG, - " Matching lock found for unlock"); + posix_lock_t *conf = find_matching_reservelk(lock, pl_inode); + if (!conf) { + gf_log(this->name, GF_LOG_DEBUG, " Matching lock not found for unlock"); + goto out; + } + __delete_lock(conf); + gf_log(this->name, GF_LOG_DEBUG, " Matching lock found for unlock"); out: - return conf; - - + return conf; } static void -__grant_blocked_reserve_locks (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted) +__grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - int bl_ret = 0; - posix_lock_t *bl = NULL; - posix_lock_t *tmp = NULL; - - struct list_head blocked_list; + int bl_ret = 0; + posix_lock_t *bl = NULL; + posix_lock_t *tmp = NULL; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&pl_inode->blocked_reservelks, &blocked_list); + struct list_head blocked_list; - list_for_each_entry_safe (bl, tmp, &blocked_list, list) { + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&pl_inode->blocked_reservelks, &blocked_list); - list_del_init (&bl->list); + list_for_each_entry_safe(bl, tmp, &blocked_list, list) + { + list_del_init(&bl->list); - bl_ret = __lock_reservelk (this, pl_inode, bl, 1); + bl_ret = __lock_reservelk(this, pl_inode, bl, 1); - if (bl_ret == 0) { - list_add (&bl->list, granted); - } + if (bl_ret == 0) { + list_add(&bl->list, granted); } - return; + } + return; } /* Grant all reservelks blocked on lock(s) */ void -grant_blocked_reserve_locks (xlator_t *this, pl_inode_t 
*pl_inode) +grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head granted; - posix_lock_t *lock = NULL; - posix_lock_t *tmp = NULL; + struct list_head granted; + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; - INIT_LIST_HEAD (&granted); - - if (list_empty (&pl_inode->blocked_reservelks)) { - gf_log (this->name, GF_LOG_TRACE, - "No blocked locks to be granted"); - return; - } - - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_reserve_locks (this, pl_inode, &granted); - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe (lock, tmp, &granted, list) { - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Granted", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - - STACK_UNWIND_STRICT (lk, lock->frame, 0, 0, &lock->user_flock, - NULL); - } + INIT_LIST_HEAD(&granted); + if (list_empty(&pl_inode->blocked_reservelks)) { + gf_log(this->name, GF_LOG_TRACE, "No blocked locks to be granted"); + return; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_reserve_locks(this, pl_inode, &granted); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted, list) + { + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + + STACK_UNWIND_STRICT(lk, lock->frame, 0, 0, &lock->user_flock, NULL); + } } static void -__grant_blocked_lock_calls (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted) +__grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted) { - int bl_ret = 0; - posix_lock_t *bl = NULL; - posix_lock_t *tmp = NULL; + int bl_ret = 0; + posix_lock_t *bl = NULL; + posix_lock_t *tmp = NULL; - struct list_head blocked_list; + struct list_head blocked_list; - INIT_LIST_HEAD (&blocked_list); - list_splice_init (&pl_inode->blocked_reservelks, &blocked_list); + INIT_LIST_HEAD(&blocked_list); + list_splice_init(&pl_inode->blocked_reservelks, &blocked_list); - list_for_each_entry_safe (bl, tmp, &blocked_list, list) { + list_for_each_entry_safe(bl, tmp, &blocked_list, list) + { + list_del_init(&bl->list); - list_del_init (&bl->list); + bl_ret = pl_verify_reservelk(this, pl_inode, bl, bl->blocked); - bl_ret = pl_verify_reservelk (this, pl_inode, bl, bl->blocked); - - if (bl_ret == 0) { - list_add_tail (&bl->list, granted); - } + if (bl_ret == 0) { + list_add_tail(&bl->list, granted); } - return; + } + return; } void -grant_blocked_lock_calls (xlator_t *this, pl_inode_t *pl_inode) +grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode) { - struct list_head granted; - posix_lock_t *lock = NULL; - posix_lock_t *tmp = NULL; - fd_t *fd = NULL; - - int can_block = 0; - int32_t cmd = 0; - int ret = 0; - - if (list_empty (&pl_inode->blocked_calls)) { - gf_log (this->name, GF_LOG_TRACE, - "No blocked lock calls to be granted"); - return; - } + struct list_head granted; + posix_lock_t *lock = NULL; + posix_lock_t *tmp = NULL; + fd_t *fd = NULL; - pthread_mutex_lock (&pl_inode->mutex); - { - __grant_blocked_lock_calls (this, pl_inode, &granted); - } - pthread_mutex_unlock (&pl_inode->mutex); - - list_for_each_entry_safe 
(lock, tmp, &granted, list) { - fd = fd_from_fdnum (lock); - - if (lock->blocked) { - can_block = 1; - cmd = F_SETLKW; - } - else - cmd = F_SETLK; - - lock->blocked = 0; - ret = pl_setlk (this, pl_inode, lock, can_block); - if (ret == -1) { - if (can_block) { - pl_trace_block (this, lock->frame, fd, NULL, - cmd, &lock->user_flock, NULL); - continue; - } else { - gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN"); - pl_trace_out (this, lock->frame, fd, NULL, cmd, - &lock->user_flock, -1, EAGAIN, NULL); - pl_update_refkeeper (this, fd->inode); - STACK_UNWIND_STRICT (lk, lock->frame, -1, - EAGAIN, &lock->user_flock, - NULL); - __destroy_lock (lock); - } - } + int can_block = 0; + int32_t cmd = 0; + int ret = 0; + if (list_empty(&pl_inode->blocked_calls)) { + gf_log(this->name, GF_LOG_TRACE, "No blocked lock calls to be granted"); + return; + } + + pthread_mutex_lock(&pl_inode->mutex); + { + __grant_blocked_lock_calls(this, pl_inode, &granted); + } + pthread_mutex_unlock(&pl_inode->mutex); + + list_for_each_entry_safe(lock, tmp, &granted, list) + { + fd = fd_from_fdnum(lock); + + if (lock->blocked) { + can_block = 1; + cmd = F_SETLKW; + } else + cmd = F_SETLK; + + lock->blocked = 0; + ret = pl_setlk(this, pl_inode, lock, can_block); + if (ret == -1) { + if (can_block) { + continue; + } else { + gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN"); + pl_trace_out(this, lock->frame, fd, NULL, cmd, + &lock->user_flock, -1, EAGAIN, NULL); + pl_update_refkeeper(this, fd->inode); + STACK_UNWIND_STRICT(lk, lock->frame, -1, EAGAIN, + &lock->user_flock, NULL); + __destroy_lock(lock); + } } - + } } - int -pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) +pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock) { - posix_lock_t *retlock = NULL; - int ret = -1; - - pthread_mutex_lock (&pl_inode->mutex); - { - retlock = __reserve_unlock_lock (this, lock, pl_inode); - if (!retlock) { - gf_log (this->name, GF_LOG_DEBUG, - "Bad Unlock 
issued on Inode lock"); - ret = -EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "Reservelk Unlock successful"); - __destroy_reserve_lock (retlock); - ret = 0; + posix_lock_t *retlock = NULL; + int ret = -1; + + pthread_mutex_lock(&pl_inode->mutex); + { + retlock = __reserve_unlock_lock(this, lock, pl_inode); + if (!retlock) { + pthread_mutex_unlock(&pl_inode->mutex); + gf_log(this->name, GF_LOG_DEBUG, "Bad Unlock issued on Inode lock"); + ret = -EINVAL; + goto out; } -out: - pthread_mutex_unlock (&pl_inode->mutex); - - grant_blocked_reserve_locks (this, pl_inode); - grant_blocked_lock_calls (this, pl_inode); - return ret; + gf_log(this->name, GF_LOG_TRACE, "Reservelk Unlock successful"); + __destroy_lock(retlock); + ret = 0; + } + pthread_mutex_unlock(&pl_inode->mutex); +out: + grant_blocked_reserve_locks(this, pl_inode); + grant_blocked_lock_calls(this, pl_inode); + return ret; } int -pl_reserve_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, - int can_block) +pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block) { - int ret = -EINVAL; - - pthread_mutex_lock (&pl_inode->mutex); - { - - ret = __lock_reservelk (this, pl_inode, lock, can_block); - if (ret < 0) - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => NOK", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - else - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK", - lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->fl_start, - lock->fl_end); - - } - pthread_mutex_unlock (&pl_inode->mutex); - return ret; + int ret = -EINVAL; + + pthread_mutex_lock(&pl_inode->mutex); + { + ret = __lock_reservelk(this, pl_inode, lock, can_block); + } + pthread_mutex_unlock(&pl_inode->mutex); + + if (ret < 0) + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => NOK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->user_flock.l_start, + lock->user_flock.l_len); + else + gf_log(this->name, GF_LOG_TRACE, + "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, + lkowner_utoa(&lock->owner), lock->fl_start, lock->fl_end); + + return ret; } diff --git a/xlators/features/locks/tests/unit-test.c b/xlators/features/locks/tests/unit-test.c index d2cca32dec3..d285b12b5aa 100644 --- a/xlators/features/locks/tests/unit-test.c +++ b/xlators/features/locks/tests/unit-test.c @@ -7,59 +7,71 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "compat.h" -#include "xlator.h" -#include "inode.h" -#include "logging.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include "locks.h" #include "common.h" -#define expect(cond) if (!(cond)) { goto out; } +#define expect(cond) \ + if (!(cond)) { \ + goto out; \ + } -extern int lock_name (pl_inode_t *, const char *, entrylk_type); -extern int unlock_name (pl_inode_t *, const char *, entrylk_type); +extern int +lock_name(pl_inode_t *, const char *, entrylk_type); +extern int +unlock_name(pl_inode_t *, const char *, entrylk_type); -int main (int argc, char **argv) +int +main(int argc, char **argv) { - int ret = 1; - int r = -1; + int ret = 1; + int r = -1; + + pl_inode_t *pinode = CALLOC(sizeof(pl_inode_t), 1); + pthread_mutex_init(&pinode->dir_lock_mutex, NULL); + INIT_LIST_HEAD(&pinode->gf_dir_locks); - pl_inode_t *pinode = CALLOC (sizeof (pl_inode_t), 1); - pthread_mutex_init (&pinode->dir_lock_mutex, NULL); - INIT_LIST_HEAD (&pinode->gf_dir_locks); + r = lock_name(pinode, NULL, ENTRYLK_WRLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == -EAGAIN); + } + r = unlock_name(pinode, NULL, ENTRYLK_WRLCK); + expect(r == 0); - r = lock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN); - } - r = unlock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0); + r = lock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + { + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == -EAGAIN); + } + r = unlock_name(pinode, "foo", ENTRYLK_RDLCK); + expect(r == 0); + } + r = unlock_name(pinode, 
"foo", ENTRYLK_RDLCK); + expect(r == 0); - r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - { - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN); - } - r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - } - r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0); - - r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0); - r = unlock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0); + r = lock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == 0); + r = unlock_name(pinode, "foo", ENTRYLK_WRLCK); + expect(r == 0); - r = lock_name (pinode, "baz", ENTRYLK_WRLCK); expect (r == 0); - r = lock_name (pinode, "baz", ENTRYLK_RDLCK); expect (r == -EAGAIN); + r = lock_name(pinode, "baz", ENTRYLK_WRLCK); + expect(r == 0); + r = lock_name(pinode, "baz", ENTRYLK_RDLCK); + expect(r == -EAGAIN); - ret = 0; + ret = 0; out: - return ret; + return ret; } diff --git a/xlators/features/mac-compat/src/Makefile.am b/xlators/features/mac-compat/src/Makefile.am deleted file mode 100644 index f8567edce71..00000000000 --- a/xlators/features/mac-compat/src/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -xlator_LTLIBRARIES = mac-compat.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features - -mac_compat_la_LDFLAGS = -module -avoid-version - -mac_compat_la_SOURCES = mac-compat.c -mac_compat_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/features/mac-compat/src/mac-compat.c b/xlators/features/mac-compat/src/mac-compat.c deleted file mode 100644 index 7cb550ad581..00000000000 --- a/xlators/features/mac-compat/src/mac-compat.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" -#include "compat-errno.h" - - -enum apple_xattr { - GF_FINDER_INFO_XATTR, - GF_RESOURCE_FORK_XATTR, - GF_XATTR_ALL, - GF_XATTR_NONE -}; - -static char *apple_xattr_name[] = { - [GF_FINDER_INFO_XATTR] = "com.apple.FinderInfo", - [GF_RESOURCE_FORK_XATTR] = "com.apple.ResourceFork" -}; - -static const char *apple_xattr_value[] = { - [GF_FINDER_INFO_XATTR] = - /* 1 2 3 4 5 6 7 8 */ - "\0\0\0\0\0\0\0\0" - "\0\0\0\0\0\0\0\0" - "\0\0\0\0\0\0\0\0" - "\0\0\0\0\0\0\0\0", - [GF_RESOURCE_FORK_XATTR] = "" -}; - -static int32_t apple_xattr_len[] = { - [GF_FINDER_INFO_XATTR] = 32, - [GF_RESOURCE_FORK_XATTR] = 1 -}; - - -int32_t -maccomp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - intptr_t ax = (intptr_t)this->private; - int i = 0; - - if ((ax == GF_XATTR_ALL && op_ret >= 0) || ax != GF_XATTR_NONE) { - op_ret = op_errno = 0; - - for (i = 0; i < GF_XATTR_ALL; i++) { - if (dict_get (dict, apple_xattr_name[i])) - continue; - - if (dict_set (dict, apple_xattr_name[i], - bin_to_data ((void *)apple_xattr_value[i], - apple_xattr_len[i])) == -1) { - op_ret = -1; - op_errno = ENOMEM; - - break; - } - } - } - - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); - - return 0; -} - - -int32_t -maccomp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - intptr_t ax = GF_XATTR_NONE; - int i = 0; - - if (name) { - for (i = 0; i < GF_XATTR_ALL; i++) { - if (strcmp (apple_xattr_name[i], name) == 0) { - ax = i; - - break; - } - } - } else - ax = GF_XATTR_ALL; - - 
this->private = (void *)ax; - - STACK_WIND (frame, maccomp_getxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - loc, name, xdata); - return 0; -} - - -int32_t -maccomp_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) -{ - intptr_t ax = GF_XATTR_NONE; - int i = 0; - - if (name) { - for (i = 0; i < GF_XATTR_ALL; i++) { - if (strcmp (apple_xattr_name[i], name) == 0) { - ax = i; - - break; - } - } - } else - ax = GF_XATTR_ALL; - - this->private = (void *)ax; - - STACK_WIND (frame, maccomp_getxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, - fd, name, xdata); - return 0; -} - - -int32_t -maccomp_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - intptr_t ax = (intptr_t)this->private; - - if (op_ret == -1 && ax != GF_XATTR_NONE) - op_ret = op_errno = 0; - - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); - - return 0; -} - - -int32_t -maccomp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - intptr_t ax = GF_XATTR_NONE; - int i = 0; - - for (i = 0; i < GF_XATTR_ALL; i++) { - if (dict_get (dict, apple_xattr_name[i])) { - ax = i; - - break; - } - } - - this->private = (void *)ax; - - STACK_WIND (frame, maccomp_setxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - loc, dict, flags, xdata); - return 0; -} - - -int32_t -maccomp_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - intptr_t ax = GF_XATTR_NONE; - int i = 0; - - for (i = 0; i < GF_XATTR_ALL; i++) { - if (dict_get (dict, apple_xattr_name[i])) { - ax = i; - - break; - } - } - - this->private = (void *)ax; - - STACK_WIND (frame, maccomp_setxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, - fd, dict, flags, xdata); - return 0; -} - - -int32_t -init (xlator_t *this) -{ - if (!this->children || 
this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - - -struct xlator_fops fops = { - .getxattr = maccomp_getxattr, - .fgetxattr = maccomp_fgetxattr, - .setxattr = maccomp_setxattr, - .fsetxattr = maccomp_fsetxattr, -}; - -struct xlator_cbks cbks; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/marker/src/Makefile.am b/xlators/features/marker/src/Makefile.am index a7c67647218..58056b36511 100644 --- a/xlators/features/marker/src/Makefile.am +++ b/xlators/features/marker/src/Makefile.am @@ -1,14 +1,21 @@ +if WITH_SERVER xlator_LTLIBRARIES = marker.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -marker_la_LDFLAGS = -module -avoid-version +marker_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +marker_la_SOURCES = marker.c marker-quota.c marker-quota-helper.c \ + marker-common.c -marker_la_SOURCES = marker.c marker-quota.c marker-quota-helper.c marker-common.c marker_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = marker-mem-types.h marker.h marker-quota.h marker-quota-helper.h marker-common.h $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = marker-mem-types.h marker.h marker-quota.h \ + marker-quota-helper.h marker-common.h \ + $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ -I$(top_srcdir)/xlators/lib/src AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) diff --git a/xlators/features/marker/src/marker-common.c b/xlators/features/marker/src/marker-common.c index 84a718add97..9c9047005d6 100644 --- a/xlators/features/marker/src/marker-common.c +++ 
b/xlators/features/marker/src/marker-common.c @@ -7,63 +7,51 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif #include <fnmatch.h> #include "marker-common.h" marker_inode_ctx_t * -marker_inode_ctx_new () +marker_inode_ctx_new() { - marker_inode_ctx_t *ctx = NULL; + marker_inode_ctx_t *ctx = NULL; - ctx = GF_CALLOC (1, sizeof (marker_inode_ctx_t), - gf_marker_mt_marker_inode_ctx_t); - if (ctx == NULL) - goto out; + ctx = GF_CALLOC(1, sizeof(marker_inode_ctx_t), + gf_marker_mt_marker_inode_ctx_t); + if (ctx == NULL) + goto out; - ctx->quota_ctx = NULL; + ctx->quota_ctx = NULL; out: - return ctx; + return ctx; } int32_t -marker_force_inode_ctx_get (inode_t *inode, xlator_t *this, - marker_inode_ctx_t **ctx) +marker_force_inode_ctx_get(inode_t *inode, xlator_t *this, + marker_inode_ctx_t **ctx) { - int32_t ret = -1; - uint64_t ctx_int = 0; - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx_int); - if (ret == 0) - *ctx = (marker_inode_ctx_t *) (unsigned long)ctx_int; - else { - *ctx = marker_inode_ctx_new (); - if (*ctx == NULL) - goto unlock; - - ret = __inode_ctx_put (inode, this, - (uint64_t )(unsigned long) *ctx); - if (ret == -1) { - GF_FREE (*ctx); - goto unlock; - } - ret = 0; - } + int32_t ret = -1; + uint64_t ctx_int = 0; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ret == 0) + *ctx = (marker_inode_ctx_t *)(unsigned long)ctx_int; + else { + *ctx = marker_inode_ctx_new(); + if (*ctx == NULL) + goto unlock; + + ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)*ctx); + if (ret == -1) { + GF_FREE(*ctx); + goto unlock; + } + ret = 0; } -unlock: UNLOCK (&inode->lock); + } +unlock: + UNLOCK(&inode->lock); - return ret; -} - -int -marker_filter_quota_xattr (dict_t *dict, char *key, - data_t *value, void *data) -{ - dict_del (dict, key); - return 0; + return 
ret; } diff --git a/xlators/features/marker/src/marker-common.h b/xlators/features/marker/src/marker-common.h index 23dd846cb0a..7f8cffe7d35 100644 --- a/xlators/features/marker/src/marker-common.h +++ b/xlators/features/marker/src/marker-common.h @@ -10,18 +10,10 @@ #ifndef _MARKER_COMMON_H #define _MARKER_COMMON_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "inode.h" -#include "xlator.h" +#include <glusterfs/xlator.h> #include "marker.h" int32_t -marker_force_inode_ctx_get (inode_t *, xlator_t *, marker_inode_ctx_t **); +marker_force_inode_ctx_get(inode_t *, xlator_t *, marker_inode_ctx_t **); -int -marker_filter_quota_xattr (dict_t *, char *, data_t *, void *); #endif diff --git a/xlators/features/marker/src/marker-mem-types.h b/xlators/features/marker/src/marker-mem-types.h index 1f74d504897..aedfdb4a1b7 100644 --- a/xlators/features/marker/src/marker-mem-types.h +++ b/xlators/features/marker/src/marker-mem-types.h @@ -10,16 +10,19 @@ #ifndef __MARKER_MEM_TYPES_H__ #define __MARKER_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_marker_mem_types_ { - gf_marker_mt_marker_conf_t = gf_common_mt_end + 1, - gf_marker_mt_loc_t, - gf_marker_mt_volume_mark, - gf_marker_mt_int64_t, - gf_marker_mt_quota_inode_ctx_t, - gf_marker_mt_marker_inode_ctx_t, - gf_marker_mt_inode_contribution_t, - gf_marker_mt_end + /* Those are used by ALLOCATE_OR_GOTO macro */ + gf_marker_mt_marker_conf_t = gf_common_mt_end + 1, + gf_marker_mt_loc_t, + gf_marker_mt_volume_mark, + gf_marker_mt_int64_t, + gf_marker_mt_quota_inode_ctx_t, + gf_marker_mt_marker_inode_ctx_t, + gf_marker_mt_inode_contribution_t, + gf_marker_mt_quota_meta_t, + gf_marker_mt_quota_synctask_t, + gf_marker_mt_end }; #endif diff --git a/xlators/features/marker/src/marker-quota-helper.c b/xlators/features/marker/src/marker-quota-helper.c index ec0d83316c7..ecd85d67b2b 100644 --- a/xlators/features/marker/src/marker-quota-helper.c +++ 
b/xlators/features/marker/src/marker-quota-helper.c @@ -7,417 +7,374 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "locking.h" +#include <glusterfs/locking.h> #include "marker-quota.h" #include "marker-common.h" #include "marker-quota-helper.h" #include "marker-mem-types.h" int -mq_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path) +mq_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO ("marker", loc, out); - GF_VALIDATE_OR_GOTO ("marker", inode, out); - GF_VALIDATE_OR_GOTO ("marker", path, out); - /* Not checking for parent because while filling - * loc of root, parent will be NULL - */ + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", inode, out); + GF_VALIDATE_OR_GOTO("marker", path, out); + /* Not checking for parent because while filling + * loc of root, parent will be NULL + */ - if (inode) { - loc->inode = inode_ref (inode); - } + if (inode) { + loc->inode = inode_ref(inode); + } - if (parent) - loc->parent = inode_ref (parent); + if (parent) + loc->parent = inode_ref(parent); - loc->path = gf_strdup (path); - if (!loc->path) { - gf_log ("loc fill", GF_LOG_ERROR, "strdup failed"); - goto loc_wipe; - } + if (!gf_uuid_is_null(inode->gfid)) + gf_uuid_copy(loc->gfid, inode->gfid); - loc->name = strrchr (loc->path, '/'); - if (loc->name) - loc->name++; - else - goto loc_wipe; + loc->path = gf_strdup(path); + if (!loc->path) { + gf_log("loc fill", GF_LOG_ERROR, "strdup failed"); + goto out; + } + + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + else + goto out; + + ret = 0; - ret = 0; -loc_wipe: - if (ret < 0) - loc_wipe (loc); out: - return ret; -} + if (ret < 0) + loc_wipe(loc); + return ret; +} int32_t -mq_inode_loc_fill (const char *parent_gfid, inode_t *inode, loc_t 
*loc) +mq_inode_loc_fill(const char *parent_gfid, inode_t *inode, loc_t *loc) { - char *resolvedpath = NULL; - inode_t *parent = NULL; - int ret = -1; + char *resolvedpath = NULL; + inode_t *parent = NULL; + quota_inode_ctx_t *ctx = NULL; + xlator_t *this = NULL; + int ret = -1; + + this = THIS; + + if (inode == NULL) { + gf_log_callingfn("marker", GF_LOG_ERROR, + "loc fill failed, " + "inode is NULL"); + return ret; + } - if ((!inode) || (!loc)) - return ret; + if (loc == NULL) + return ret; - if ((inode) && __is_root_gfid (inode->gfid)) { - loc->parent = NULL; - goto ignore_parent; - } + if ((inode) && __is_root_gfid(inode->gfid)) { + loc->parent = NULL; + goto ignore_parent; + } - if (parent_gfid == NULL) - parent = inode_parent (inode, 0, NULL); - else - parent = inode_find (inode->table, - (unsigned char *) parent_gfid); + if (parent_gfid == NULL) + parent = inode_parent(inode, 0, NULL); + else + parent = inode_find(inode->table, (unsigned char *)parent_gfid); - if (parent == NULL) - goto err; + if (parent == NULL) { + gf_log("marker", GF_LOG_ERROR, "parent is NULL for %s", + uuid_utoa(inode->gfid)); + goto err; + } ignore_parent: - ret = inode_path (inode, NULL, &resolvedpath); - if (ret < 0) - goto err; - - ret = mq_loc_fill (loc, inode, parent, resolvedpath); - if (ret < 0) - goto err; + ret = inode_path(inode, NULL, &resolvedpath); + if (ret < 0) { + gf_log("marker", GF_LOG_ERROR, "failed to resolve path for %s", + uuid_utoa(inode->gfid)); + goto err; + } + + ret = mq_loc_fill(loc, inode, parent, resolvedpath); + if (ret < 0) + goto err; + + ret = mq_inode_ctx_get(inode, this, &ctx); + if (ret < 0 || ctx == NULL) + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + ret = -1; + goto err; + } + ret = 0; err: - if (parent) - inode_unref (parent); + if (parent) + inode_unref(parent); - GF_FREE (resolvedpath); + GF_FREE(resolvedpath); - return ret; 
+ return ret; } - quota_inode_ctx_t * -mq_alloc_inode_ctx () -{ - int32_t ret = -1; - quota_inode_ctx_t *ctx = NULL; - - QUOTA_ALLOC (ctx, quota_inode_ctx_t, ret); - if (ret == -1) - goto out; - - ctx->size = 0; - ctx->dirty = 0; - ctx->updation_status = _gf_false; - LOCK_INIT (&ctx->lock); - INIT_LIST_HEAD (&ctx->contribution_head); -out: - return ctx; -} - -inode_contribution_t * -mq_get_contribution_node (inode_t *inode, quota_inode_ctx_t *ctx) +mq_alloc_inode_ctx() { - inode_contribution_t *contri = NULL; - inode_contribution_t *temp = NULL; - - if (!inode || !ctx) - goto out; - - list_for_each_entry (temp, &ctx->contribution_head, contri_list) { - if (uuid_compare (temp->gfid, inode->gfid) == 0) { - contri = temp; - goto out; - } - } + int32_t ret = -1; + quota_inode_ctx_t *ctx = NULL; + + QUOTA_ALLOC(ctx, quota_inode_ctx_t, ret); + if (ret == -1) + goto out; + + ctx->size = 0; + ctx->dirty = 0; + ctx->updation_status = _gf_false; + LOCK_INIT(&ctx->lock); + INIT_LIST_HEAD(&ctx->contribution_head); out: - return contri; + return ctx; } - -int32_t -mq_delete_contribution_node (dict_t *dict, char *key, - inode_contribution_t *contribution) +static void +mq_contri_fini(inode_contribution_t *contri) { - if (dict_get (dict, key) != NULL) - goto out; - - QUOTA_FREE_CONTRIBUTION_NODE (contribution); -out: - return 0; + LOCK_DESTROY(&contri->lock); + GF_FREE(contri); } - inode_contribution_t * -__mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, - loc_t *loc) +mq_contri_init(inode_t *inode) { - int32_t ret = 0; - inode_contribution_t *contribution = NULL; - - if (!loc->parent) { - if (!uuid_is_null (loc->pargfid)) - loc->parent = inode_find (loc->inode->table, - loc->pargfid); - - if (!loc->parent) - loc->parent = inode_parent (loc->inode, loc->pargfid, - loc->name); - if (!loc->parent) - goto out; - } - - list_for_each_entry (contribution, &ctx->contribution_head, - contri_list) { - if (loc->parent && - uuid_compare (contribution->gfid, 
loc->parent->gfid) == 0) { - goto out; - } - } - - QUOTA_ALLOC (contribution, inode_contribution_t, ret); - if (ret == -1) - goto out; + inode_contribution_t *contri = NULL; + int32_t ret = 0; - contribution->contribution = 0; + QUOTA_ALLOC(contri, inode_contribution_t, ret); + if (ret == -1) + goto out; - uuid_copy (contribution->gfid, loc->parent->gfid); + GF_REF_INIT(contri, mq_contri_fini); - LOCK_INIT (&contribution->lock); - INIT_LIST_HEAD (&contribution->contri_list); + contri->contribution = 0; + contri->file_count = 0; + contri->dir_count = 0; + gf_uuid_copy(contri->gfid, inode->gfid); - list_add_tail (&contribution->contri_list, &ctx->contribution_head); + LOCK_INIT(&contri->lock); + INIT_LIST_HEAD(&contri->contri_list); out: - return contribution; + return contri; } - inode_contribution_t * -mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, - loc_t *loc) +mq_get_contribution_node(inode_t *inode, quota_inode_ctx_t *ctx) { - inode_contribution_t *contribution = NULL; + inode_contribution_t *contri = NULL; + inode_contribution_t *temp = NULL; - if ((ctx == NULL) || (loc == NULL)) - return NULL; + if (!inode || !ctx) + goto out; - if (((loc->path) && (strcmp (loc->path, "/") == 0)) - || (!loc->path && uuid_is_null (loc->pargfid))) - return NULL; + LOCK(&ctx->lock); + { + if (list_empty(&ctx->contribution_head)) + goto unlock; - LOCK (&ctx->lock); + list_for_each_entry(temp, &ctx->contribution_head, contri_list) { - contribution = __mq_add_new_contribution_node (this, ctx, loc); - } - UNLOCK (&ctx->lock); - - return contribution; -} - - -int32_t -mq_dict_set_contribution (xlator_t *this, dict_t *dict, - loc_t *loc) -{ - int32_t ret = -1; - char contri_key [512] = {0, }; - - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", dict, out); - GF_VALIDATE_OR_GOTO ("marker", loc, out); - - if (loc->parent) { - GET_CONTRI_KEY (contri_key, loc->parent->gfid, ret); - if (ret < 0) { - ret = -1; - goto out; - } - } else { - /* 
nameless lookup, fetch contributions to all parents */ - GET_CONTRI_KEY (contri_key, NULL, ret); - } - - ret = dict_set_int64 (dict, contri_key, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "unable to set dict value on %s.", - loc->path); - goto out; + if (gf_uuid_compare(temp->gfid, inode->gfid) == 0) { + contri = temp; + GF_REF_GET(contri); + break; + } } + } +unlock: + UNLOCK(&ctx->lock); - ret = 0; out: - return ret; + return contri; } - -int32_t -mq_inode_ctx_get (inode_t *inode, xlator_t *this, - quota_inode_ctx_t **ctx) +inode_contribution_t * +__mq_add_new_contribution_node(xlator_t *this, quota_inode_ctx_t *ctx, + loc_t *loc) { - int32_t ret = -1; - uint64_t ctx_int = 0; - marker_inode_ctx_t *mark_ctx = NULL; - - GF_VALIDATE_OR_GOTO ("marker", inode, out); - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", ctx, out); - - ret = inode_ctx_get (inode, this, &ctx_int); - if (ret < 0) { - ret = -1; - *ctx = NULL; - goto out; - } - - mark_ctx = (marker_inode_ctx_t *) (unsigned long)ctx_int; - if (mark_ctx->quota_ctx == NULL) { - ret = -1; - goto out; + inode_contribution_t *contribution = NULL; + + if (!loc->parent) { + if (!gf_uuid_is_null(loc->pargfid)) + loc->parent = inode_find(loc->inode->table, loc->pargfid); + + if (!loc->parent) + loc->parent = inode_parent(loc->inode, loc->pargfid, loc->name); + if (!loc->parent) + goto out; + } + + list_for_each_entry(contribution, &ctx->contribution_head, contri_list) + { + if (loc->parent && + gf_uuid_compare(contribution->gfid, loc->parent->gfid) == 0) { + goto out; } + } - *ctx = mark_ctx->quota_ctx; - - ret = 0; - -out: - return ret; -} - + contribution = mq_contri_init(loc->parent); + if (contribution == NULL) + goto out; -quota_inode_ctx_t * -__mq_inode_ctx_new (inode_t *inode, xlator_t *this) -{ - int32_t ret = -1; - quota_inode_ctx_t *quota_ctx = NULL; - marker_inode_ctx_t *mark_ctx = NULL; - - ret = marker_force_inode_ctx_get (inode, this, &mark_ctx); - if (ret < 0) 
{ - gf_log (this->name, GF_LOG_ERROR, - "marker_force_inode_ctx_get() failed"); - goto out; - } + list_add_tail(&contribution->contri_list, &ctx->contribution_head); - LOCK (&inode->lock); - { - if (mark_ctx->quota_ctx == NULL) { - quota_ctx = mq_alloc_inode_ctx (); - if (quota_ctx == NULL) { - ret = -1; - goto unlock; - } - mark_ctx->quota_ctx = quota_ctx; - } else { - quota_ctx = mark_ctx->quota_ctx; - } - - ret = 0; - } -unlock: - UNLOCK (&inode->lock); out: - return quota_ctx; -} - - -quota_inode_ctx_t * -mq_inode_ctx_new (inode_t * inode, xlator_t *this) -{ - return __mq_inode_ctx_new (inode, this); + return contribution; } -quota_local_t * -mq_local_new () +inode_contribution_t * +mq_add_new_contribution_node(xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc) { - quota_local_t *local = NULL; + inode_contribution_t *contribution = NULL; - local = mem_get0 (THIS->local_pool); - if (!local) - goto out; + if ((ctx == NULL) || (loc == NULL)) + return NULL; - local->ref = 1; - LOCK_INIT (&local->lock); + if (((loc->path) && (strcmp(loc->path, "/") == 0)) || + (!loc->path && gf_uuid_is_null(loc->pargfid))) + return NULL; - local->ctx = NULL; - local->contri = NULL; + LOCK(&ctx->lock); + { + contribution = __mq_add_new_contribution_node(this, ctx, loc); + if (contribution) + GF_REF_GET(contribution); + } + UNLOCK(&ctx->lock); -out: - return local; + return contribution; } -quota_local_t * -mq_local_ref (quota_local_t *local) +int32_t +mq_dict_set_contribution(xlator_t *this, dict_t *dict, loc_t *loc, uuid_t gfid, + char *contri_key) { - LOCK (&local->lock); - { - local->ref ++; + int32_t ret = -1; + char key[QUOTA_KEY_MAX] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", dict, out); + GF_VALIDATE_OR_GOTO("marker", loc, out); + + if (gfid && !gf_uuid_is_null(gfid)) { + GET_CONTRI_KEY(this, key, gfid, ret); + } else if (loc->parent) { + GET_CONTRI_KEY(this, key, loc->parent->gfid, ret); + } else { + /* nameless lookup, fetch 
contributions to all parents */ + GET_CONTRI_KEY(this, key, NULL, ret); + } + + if (ret < 0) + goto out; + + ret = dict_set_int64(dict, key, 0); + if (ret < 0) + goto out; + + if (contri_key) + if (snprintf(contri_key, QUOTA_KEY_MAX, "%s", key) >= QUOTA_KEY_MAX) { + ret = -1; + goto out; } - UNLOCK (&local->lock); - return local; -} +out: + if (ret < 0) + gf_log_callingfn(this ? this->name : "Marker", GF_LOG_ERROR, + "dict set failed"); + return ret; +} int32_t -mq_local_unref (xlator_t *this, quota_local_t *local) +mq_inode_ctx_get(inode_t *inode, xlator_t *this, quota_inode_ctx_t **ctx) { - int32_t ref = 0; - if (local == NULL) - goto out; + int32_t ret = -1; + uint64_t ctx_int = 0; + marker_inode_ctx_t *mark_ctx = NULL; - QUOTA_SAFE_DECREMENT (&local->lock, local->ref, ref); + GF_VALIDATE_OR_GOTO("marker", inode, out); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", ctx, out); - if (ref != 0) - goto out; + ret = inode_ctx_get(inode, this, &ctx_int); + if (ret < 0) { + ret = -1; + *ctx = NULL; + goto out; + } - if (local->fd != NULL) - fd_unref (local->fd); + mark_ctx = (marker_inode_ctx_t *)(unsigned long)ctx_int; + if (mark_ctx->quota_ctx == NULL) { + ret = -1; + goto out; + } - loc_wipe (&local->loc); + *ctx = mark_ctx->quota_ctx; - loc_wipe (&local->parent_loc); + ret = 0; - LOCK_DESTROY (&local->lock); - - mem_put (local); out: - return 0; + return ret; } - -inode_contribution_t * -mq_get_contribution_from_loc (xlator_t *this, loc_t *loc) +quota_inode_ctx_t * +__mq_inode_ctx_new(inode_t *inode, xlator_t *this) { - int32_t ret = 0; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - ret = mq_inode_ctx_get (loc->inode, this, &ctx); - if (ret < 0) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "cannot get marker-quota context from inode " - "(gfid:%s, path:%s)", - uuid_utoa (loc->inode->gfid), loc->path); - goto err; + int32_t ret = -1; + quota_inode_ctx_t *quota_ctx = NULL; + marker_inode_ctx_t 
*mark_ctx = NULL; + + ret = marker_force_inode_ctx_get(inode, this, &mark_ctx); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "marker_force_inode_ctx_get() failed"); + goto out; + } + + LOCK(&inode->lock); + { + if (mark_ctx->quota_ctx == NULL) { + quota_ctx = mq_alloc_inode_ctx(); + if (quota_ctx == NULL) { + ret = -1; + goto unlock; + } + mark_ctx->quota_ctx = quota_ctx; + } else { + quota_ctx = mark_ctx->quota_ctx; } - contribution = mq_get_contribution_node (loc->parent, ctx); - if (contribution == NULL) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "inode (gfid:%s, path:%s) has " - "no contribution towards parent (gfid:%s)", - uuid_utoa (loc->inode->gfid), - loc->path, uuid_utoa (loc->parent->gfid)); - goto err; - } + ret = 0; + } +unlock: + UNLOCK(&inode->lock); +out: + return quota_ctx; +} -err: - return contribution; +quota_inode_ctx_t * +mq_inode_ctx_new(inode_t *inode, xlator_t *this) +{ + return __mq_inode_ctx_new(inode, this); } diff --git a/xlators/features/marker/src/marker-quota-helper.h b/xlators/features/marker/src/marker-quota-helper.h index 6cdd148810b..d4091dd2180 100644 --- a/xlators/features/marker/src/marker-quota-helper.h +++ b/xlators/features/marker/src/marker-quota-helper.h @@ -9,68 +9,58 @@ */ #ifndef _MARKER_QUOTA_HELPER_H -#define _MARKER_QUOTA_HELPER - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +#define _MARKER_QUOTA_HELPER_H #include "marker.h" -#define QUOTA_FREE_CONTRIBUTION_NODE(_contribution) \ - do { \ - list_del (&_contribution->contri_list); \ - GF_FREE (_contribution); \ - } while (0) - -#define QUOTA_SAFE_INCREMENT(lock, var) \ - do { \ - LOCK (lock); \ - var ++; \ - UNLOCK (lock); \ - } while (0) - -#define QUOTA_SAFE_DECREMENT(lock, var, value) \ - do { \ - LOCK (lock); \ - { \ - value = --var; \ - } \ - UNLOCK (lock); \ - } while (0) +#define QUOTA_FREE_CONTRIBUTION_NODE(ctx, _contribution) \ + do { \ + LOCK(&ctx->lock); \ + { \ + list_del_init(&_contribution->contri_list); \ + 
GF_REF_PUT(_contribution); \ + } \ + UNLOCK(&ctx->lock); \ + } while (0) + +#define QUOTA_SAFE_INCREMENT(lock, var) \ + do { \ + LOCK(lock); \ + var++; \ + UNLOCK(lock); \ + } while (0) + +#define QUOTA_SAFE_DECREMENT(lock, var, value) \ + do { \ + LOCK(lock); \ + { \ + value = --var; \ + } \ + UNLOCK(lock); \ + } while (0) inode_contribution_t * -mq_add_new_contribution_node (xlator_t *, quota_inode_ctx_t *, loc_t *); +mq_add_new_contribution_node(xlator_t *, quota_inode_ctx_t *, loc_t *); int32_t -mq_dict_set_contribution (xlator_t *, dict_t *, loc_t *); +mq_dict_set_contribution(xlator_t *, dict_t *, loc_t *, uuid_t, char *); quota_inode_ctx_t * -mq_inode_ctx_new (inode_t *, xlator_t *); +mq_inode_ctx_new(inode_t *, xlator_t *); int32_t -mq_inode_ctx_get (inode_t *, xlator_t *, quota_inode_ctx_t **); +mq_inode_ctx_get(inode_t *, xlator_t *, quota_inode_ctx_t **); int32_t -mq_delete_contribution_node (dict_t *, char *, inode_contribution_t *); - -int32_t -mq_inode_loc_fill (const char *, inode_t *, loc_t *); - -quota_local_t * -mq_local_new (); - -quota_local_t * -mq_local_ref (quota_local_t *); +mq_delete_contribution_node(dict_t *, char *, inode_contribution_t *); int32_t -mq_local_unref (xlator_t *, quota_local_t *); +mq_inode_loc_fill(const char *, inode_t *, loc_t *); inode_contribution_t * -mq_get_contribution_node (inode_t *, quota_inode_ctx_t *); +mq_contri_init(inode_t *inode); inode_contribution_t * -mq_get_contribution_from_loc (xlator_t *this, loc_t *loc); +mq_get_contribution_node(inode_t *, quota_inode_ctx_t *); #endif diff --git a/xlators/features/marker/src/marker-quota.c b/xlators/features/marker/src/marker-quota.c index d972d7f85b0..3de2ea1c92c 100644 --- a/xlators/features/marker/src/marker-quota.c +++ b/xlators/features/marker/src/marker-quota.c @@ -7,2697 +7,2291 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "dict.h" -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "libxlator.h" -#include "common-utils.h" -#include "byte-order.h" +#include <glusterfs/common-utils.h> +#include <glusterfs/byte-order.h> #include "marker-quota.h" #include "marker-quota-helper.h" +#include <glusterfs/syncop.h> +#include <glusterfs/quota-common-utils.h> int -mq_loc_copy (loc_t *dst, loc_t *src) +mq_loc_copy(loc_t *dst, loc_t *src) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO ("marker", dst, out); - GF_VALIDATE_OR_GOTO ("marker", src, out); + GF_VALIDATE_OR_GOTO("marker", dst, out); + GF_VALIDATE_OR_GOTO("marker", src, out); - if (src->inode == NULL || - ((src->parent == NULL) && (uuid_is_null (src->pargfid)) - && !__is_root_gfid (src->inode->gfid))) { - gf_log ("marker", GF_LOG_WARNING, - "src loc is not valid"); - goto out; - } + if (src->inode == NULL || + ((src->parent == NULL) && (gf_uuid_is_null(src->pargfid)) && + !__is_root_gfid(src->inode->gfid))) { + gf_log("marker", GF_LOG_WARNING, "src loc is not valid"); + goto out; + } - ret = loc_copy (dst, src); + ret = loc_copy(dst, src); out: - return ret; + return ret; } -int32_t -mq_get_local_err (quota_local_t *local, - int32_t *val) +static void +mq_set_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag, + gf_boolean_t status) { - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("marker", local, out); - GF_VALIDATE_OR_GOTO ("marker", val, out); - - LOCK (&local->lock); - { - *val = local->err; - } - UNLOCK (&local->lock); - - ret = 0; -out: - return ret; + LOCK(&ctx->lock); + { + *flag = status; + } + UNLOCK(&ctx->lock); } -int32_t -mq_get_ctx_updation_status (quota_inode_ctx_t *ctx, - gf_boolean_t *status) +static void +mq_test_and_set_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag, + gf_boolean_t *status) { - int32_t ret = -1; - - 
GF_VALIDATE_OR_GOTO ("marker", ctx, out); - GF_VALIDATE_OR_GOTO ("marker", status, out); - - LOCK (&ctx->lock); - { - *status = ctx->updation_status; - } - UNLOCK (&ctx->lock); - - ret = 0; -out: - return ret; + gf_boolean_t temp = _gf_false; + + LOCK(&ctx->lock); + { + temp = *status; + *status = *flag; + *flag = temp; + } + UNLOCK(&ctx->lock); } +static void +mq_get_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag, + gf_boolean_t *status) +{ + LOCK(&ctx->lock); + { + *status = *flag; + } + UNLOCK(&ctx->lock); +} int32_t -mq_set_ctx_updation_status (quota_inode_ctx_t *ctx, - gf_boolean_t status) +mq_get_ctx_updation_status(quota_inode_ctx_t *ctx, gf_boolean_t *status) { - int32_t ret = -1; - - if (ctx == NULL) - goto out; - - LOCK (&ctx->lock); - { - ctx->updation_status = status; - } - UNLOCK (&ctx->lock); + GF_VALIDATE_OR_GOTO("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", status, out); - ret = 0; + mq_get_ctx_status(ctx, &ctx->updation_status, status); + return 0; out: - return ret; + return -1; } int32_t -mq_test_and_set_ctx_updation_status (quota_inode_ctx_t *ctx, - gf_boolean_t *status) +mq_set_ctx_updation_status(quota_inode_ctx_t *ctx, gf_boolean_t status) { - int32_t ret = -1; - gf_boolean_t temp = _gf_false; - - GF_VALIDATE_OR_GOTO ("marker", ctx, out); - GF_VALIDATE_OR_GOTO ("marker", status, out); + GF_VALIDATE_OR_GOTO("marker", ctx, out); - LOCK (&ctx->lock); - { - temp = *status; - *status = ctx->updation_status; - ctx->updation_status = temp; - } - UNLOCK (&ctx->lock); - - ret = 0; + mq_set_ctx_status(ctx, &ctx->updation_status, status); + return 0; out: - return ret; + return -1; } -void -mq_assign_lk_owner (xlator_t *this, call_frame_t *frame) +int32_t +mq_test_and_set_ctx_updation_status(quota_inode_ctx_t *ctx, + gf_boolean_t *status) { - marker_conf_t *conf = NULL; - uint64_t lk_owner = 0; - - conf = this->private; - - LOCK (&conf->lock); - { - if (++conf->quota_lk_owner == 0) { - ++conf->quota_lk_owner; - } - - lk_owner = 
conf->quota_lk_owner; - } - UNLOCK (&conf->lock); - - set_lk_owner_from_uint64 (&frame->root->lk_owner, lk_owner); + GF_VALIDATE_OR_GOTO("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", status, out); - return; + mq_test_and_set_ctx_status(ctx, &ctx->updation_status, status); + return 0; +out: + return -1; } - int32_t -mq_loc_fill_from_name (xlator_t *this, loc_t *newloc, loc_t *oldloc, - uint64_t ino, char *name) +mq_set_ctx_create_status(quota_inode_ctx_t *ctx, gf_boolean_t status) { - int32_t ret = -1; - int32_t len = 0; - char *path = NULL; - - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", newloc, out); - GF_VALIDATE_OR_GOTO ("marker", oldloc, out); - GF_VALIDATE_OR_GOTO ("marker", name, out); + GF_VALIDATE_OR_GOTO("marker", ctx, out); - newloc->inode = inode_new (oldloc->inode->table); - - if (!newloc->inode) { - ret = -1; - goto out; - } - - newloc->parent = inode_ref (oldloc->inode); - uuid_copy (newloc->pargfid, oldloc->inode->gfid); - - len = strlen (oldloc->path); - - if (oldloc->path [len - 1] == '/') - ret = gf_asprintf ((char **) &path, "%s%s", - oldloc->path, name); - else - ret = gf_asprintf ((char **) &path, "%s/%s", - oldloc->path, name); - - if (ret < 0) - goto out; - - newloc->path = path; - - newloc->name = strrchr (newloc->path, '/'); - - if (newloc->name) - newloc->name++; - - gf_log (this->name, GF_LOG_DEBUG, "path = %s name =%s", - newloc->path, newloc->name); + mq_set_ctx_status(ctx, &ctx->create_status, status); + return 0; out: - return ret; + return -1; } int32_t -mq_dirty_inode_updation_done (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mq_test_and_set_ctx_create_status(quota_inode_ctx_t *ctx, gf_boolean_t *status) { - QUOTA_STACK_DESTROY (frame, this); + GF_VALIDATE_OR_GOTO("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", status, out); - return 0; + mq_test_and_set_ctx_status(ctx, &ctx->create_status, status); + return 0; +out: + return -1; } 
-int32_t -mq_release_lock_on_dirty_inode (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +static void +mq_set_ctx_dirty_status(quota_inode_ctx_t *ctx, gf_boolean_t status) { - struct gf_flock lock = {0, }; - quota_local_t *local = NULL; - loc_t loc = {0, }; - int ret = -1; - - local = frame->local; - - if (op_ret == -1) { - local->err = -1; - - mq_dirty_inode_updation_done (frame, NULL, this, 0, 0, NULL); - - return 0; - } - - if (op_ret == 0) - local->ctx->dirty = 0; - - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; - - ret = loc_copy (&loc, &local->loc); - if (ret == -1) { - local->err = -1; - frame->local = NULL; - mq_dirty_inode_updation_done (frame, NULL, this, 0, 0, NULL); - return 0; - } - - if (local->loc.inode == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "Inode is NULL, so can't stackwind."); - goto out; - } + GF_VALIDATE_OR_GOTO("marker", ctx, out); - STACK_WIND (frame, - mq_dirty_inode_updation_done, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &loc, F_SETLKW, &lock, NULL); - - loc_wipe (&loc); - - return 0; + mq_set_ctx_status(ctx, &ctx->dirty_status, status); out: - mq_dirty_inode_updation_done (frame, NULL, this, -1, 0, NULL); - - return 0; + return; } -int32_t -mq_mark_inode_undirty (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +int +mq_build_ancestry(xlator_t *this, loc_t *loc) { - int32_t ret = -1; - int64_t *size = NULL; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - - local = (quota_local_t *) frame->local; - - if (op_ret == -1) - goto err; - - if (!dict) - goto wind; - - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size); - if (ret) - goto wind; + int32_t ret = -1; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + dict_t *xdata = NULL; + inode_t *tmp_parent = NULL; + inode_t *tmp_inode 
= NULL; + inode_t *linked_inode = NULL; + quota_inode_ctx_t *ctx = NULL; + + INIT_LIST_HEAD(&entries.list); + + xdata = dict_new(); + if (xdata == NULL) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -ENOMEM; + goto out; + } + + ret = dict_set_int8(xdata, GET_ANCESTRY_DENTRY_KEY, 1); + if (ret < 0) + goto out; + + fd = fd_anonymous(loc->inode); + if (fd == NULL) { + gf_log(this->name, GF_LOG_ERROR, "fd creation failed"); + ret = -ENOMEM; + goto out; + } + + fd_bind(fd); + + ret = syncop_readdirp(this, fd, 131072, 0, &entries, xdata, NULL); + if (ret < 0) { + gf_log(this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "readdirp failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + if (list_empty(&entries.list)) { + ret = -1; + goto out; + } + + list_for_each_entry(entry, &entries.list, list) + { + if (__is_root_gfid(entry->inode->gfid)) { + /* The list contains a sub-list for each possible path + * to the target inode. Each sub-list starts with the + * root entry of the tree and is followed by the child + * entries for a particular path to the target entry. + * The root entry is an implied sub-list delimiter, + * as it denotes we have started processing a new path. 
+ * Reset the parent pointer and continue + */ + + tmp_parent = NULL; + } else { + linked_inode = inode_link(entry->inode, tmp_parent, entry->d_name, + &entry->d_stat); + if (linked_inode) { + tmp_inode = entry->inode; + entry->inode = linked_inode; + inode_unref(tmp_inode); + } else { + gf_log(this->name, GF_LOG_ERROR, "inode link failed"); + ret = -EINVAL; + goto out; + } + } - LOCK (&local->ctx->lock); - { - local->ctx->size = ntoh64 (*size); + ctx = mq_inode_ctx_new(entry->inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(entry->inode->gfid)); + ret = -ENOMEM; + goto out; } - UNLOCK (&local->ctx->lock); -wind: - newdict = dict_new (); - if (!newdict) - goto err; + /* For non-directory, posix_get_ancestry_non_directory returns + * all hard-links that are represented by nodes adjacent to + * each other in the dentry-list. + * (Unlike the directory case where adjacent nodes either have + * a parent/child relationship or belong to different paths). 
+ */ + if (entry->inode->ia_type == IA_IFDIR) + tmp_parent = entry->inode; + } - ret = dict_set_int8 (newdict, QUOTA_DIRTY_KEY, 0); - if (ret) - goto err; + if (loc->parent) + inode_unref(loc->parent); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); + loc->parent = inode_parent(loc->inode, 0, NULL); + if (loc->parent == NULL) { + ret = -1; + goto out; + } - GF_UUID_ASSERT (local->loc.gfid); - STACK_WIND (frame, mq_release_lock_on_dirty_inode, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - &local->loc, newdict, 0, NULL); - ret = 0; + ret = 0; -err: - if (op_ret == -1 || ret == -1) { - local->err = -1; +out: + gf_dirent_free(&entries); - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - } + if (fd) + fd_unref(fd); - if (newdict) - dict_unref (newdict); + if (xdata) + dict_unref(xdata); - return 0; + return ret; } -int32_t -mq_update_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) +/* This function should be used only in inspect_directory and inspect_file + * function to heal quota xattrs. + * Inode quota feature is introduced in 3.7. + * If gluster setup is upgraded from 3.6 to 3.7, there can be a + * getxattr and setxattr spikes with quota heal as inode quota is missing. + * So this wrapper function is to avoid xattrs spikes during upgrade. + * This function returns success even is inode-quota xattrs are missing and + * hence no healing performed. 
+ */ +static int32_t +_quota_dict_get_meta(xlator_t *this, dict_t *dict, char *key, const int keylen, + quota_meta_t *meta, ia_type_t ia_type, + gf_boolean_t add_delta) { - int32_t ret = -1; - dict_t *new_dict = NULL; - int64_t *size = NULL; - int64_t *delta = NULL; - quota_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) - goto err; - - if (dict == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "Dict is null while updating the size xattr %s", - local->loc.path?local->loc.path:""); - goto err; - } - - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size); - if (!size) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get the size, %s", - local->loc.path?local->loc.path:""); - goto err; - } - - QUOTA_ALLOC_OR_GOTO (delta, int64_t, ret, err); - - *delta = hton64 (local->sum - ntoh64 (*size)); - - gf_log (this->name, GF_LOG_DEBUG, "calculated size = %"PRId64", " - "original size = %"PRIu64 - " path = %s diff = %"PRIu64, local->sum, ntoh64 (*size), - local->loc.path, ntoh64 (*delta)); - - new_dict = dict_new (); - if (!new_dict); + int32_t ret = 0; + marker_conf_t *priv = NULL; - ret = dict_set_bin (new_dict, QUOTA_SIZE_KEY, delta, 8); - if (ret) - goto err; + priv = this->private; - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, mq_mark_inode_undirty, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY64, new_dict, NULL); + ret = quota_dict_get_inode_meta(dict, key, keylen, meta); + if (ret == -2 && (priv->feature_enabled & GF_INODE_QUOTA) == 0) { + /* quota_dict_get_inode_meta returns -2 if + * inode quota xattrs are not present. + * if inode quota self heal is turned off, + * then we should skip healing inode quotas + */ + gf_log(this->name, GF_LOG_DEBUG, + "inode quota disabled. 
" + "inode quota self heal will not be performed"); ret = 0; - -err: - if (op_ret == -1 || ret == -1) { - local->err = -1; - - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); + if (add_delta) { + if (ia_type == IA_IFDIR) + meta->dir_count = 1; + else + meta->file_count = 1; } + } - if (new_dict) - dict_unref (new_dict); - - return 0; + return ret; } int32_t -mq_test_and_set_local_err(quota_local_t *local, - int32_t *val) +quota_dict_set_size_meta(xlator_t *this, dict_t *dict, const quota_meta_t *meta) { - int tmp = 0; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("marker", local, out); - GF_VALIDATE_OR_GOTO ("marker", val, out); - - LOCK (&local->lock); - { - tmp = local->err; - local->err = *val; - *val = tmp; - } - UNLOCK (&local->lock); - - ret = 0; + int32_t ret = -ENOMEM; + quota_meta_t *value = NULL; + char size_key[QUOTA_KEY_MAX] = { + 0, + }; + + value = GF_MALLOC(2 * sizeof(quota_meta_t), gf_common_quota_meta_t); + if (value == NULL) { + goto out; + } + value[0].size = hton64(meta->size); + value[0].file_count = hton64(meta->file_count); + value[0].dir_count = hton64(meta->dir_count); + + value[1].size = 0; + value[1].file_count = 0; + value[1].dir_count = hton64(1); + + GET_SIZE_KEY(this, size_key, ret); + if (ret < 0) + goto out; + ret = dict_set_bin(dict, size_key, value, (sizeof(quota_meta_t) * 2)); + if (ret < 0) { + gf_log_callingfn("quota", GF_LOG_ERROR, "dict set failed"); + GF_FREE(value); + } out: - return ret; + return ret; } -int32_t -mq_get_dirty_inode_size (call_frame_t *frame, xlator_t *this) +void +mq_compute_delta(quota_meta_t *delta, const quota_meta_t *op1, + const quota_meta_t *op2) { - int32_t ret = -1; - dict_t *dict = NULL; - quota_local_t *local = NULL; - - local = (quota_local_t *) frame->local; - - dict = dict_new (); - if (!dict) { - ret = -1; - goto err; - } - - ret = dict_set_int64 (dict, QUOTA_SIZE_KEY, 0); - if (ret) - goto err; - - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, 
local->loc.inode->gfid); - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, mq_update_size_xattr, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, &local->loc, dict); - ret =0; - -err: - if (ret) { - local->err = -1; - - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - } + delta->size = op1->size - op2->size; + delta->file_count = op1->file_count - op2->file_count; + delta->dir_count = op1->dir_count - op2->dir_count; +} - if (dict) - dict_unref (dict); +void +mq_add_meta(quota_meta_t *dst, const quota_meta_t *src) +{ + dst->size += src->size; + dst->file_count += src->file_count; + dst->dir_count += src->dir_count; +} - return 0; +void +mq_sub_meta(quota_meta_t *dst, const quota_meta_t *src) +{ + if (src == NULL) { + dst->size = -dst->size; + dst->file_count = -dst->file_count; + dst->dir_count = -dst->dir_count; + } else { + dst->size = src->size - dst->size; + dst->file_count = src->file_count - dst->file_count; + dst->dir_count = src->dir_count - dst->dir_count; + } } int32_t -mq_get_child_contribution (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) +mq_are_xattrs_set(xlator_t *this, loc_t *loc, gf_boolean_t *contri_set, + gf_boolean_t *size_set) { - int32_t ret = -1; - int32_t val = 0; - char contri_key [512] = {0, }; - int64_t *contri = NULL; - quota_local_t *local = NULL; - - local = frame->local; - - frame->local = NULL; - - QUOTA_STACK_DESTROY (frame, this); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s", - strerror (op_errno)); - val = -2; - if (!mq_test_and_set_local_err (local, &val) && - val != -2) - mq_release_lock_on_dirty_inode (local->frame, NULL, - this, 0, 0, NULL); - - goto exit; - } - - ret = mq_get_local_err (local, &val); - if (!ret && val == -2) - goto exit; - - GET_CONTRI_KEY (contri_key, local->loc.inode->gfid, ret); + int32_t ret = -1; + char 
contri_key[QUOTA_KEY_MAX] = { + 0, + }; + char size_key[QUOTA_KEY_MAX] = { + 0, + }; + quota_meta_t meta = { + 0, + }; + struct iatt stbuf = { + 0, + }; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + dict = dict_new(); + if (dict == NULL) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + goto out; + } + + ret = mq_req_xattr(this, loc, dict, contri_key, size_key); + if (ret < 0) + goto out; + + ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "lookup failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + if (rsp_dict == NULL) + goto out; + + *contri_set = _gf_true; + *size_set = _gf_true; + if (loc->inode->ia_type == IA_IFDIR) { + ret = quota_dict_get_inode_meta(rsp_dict, size_key, strlen(size_key), + &meta); + if (ret < 0 || meta.dir_count == 0) + *size_set = _gf_false; + } + + if (!loc_is_root(loc)) { + ret = quota_dict_get_inode_meta(rsp_dict, contri_key, + strlen(contri_key), &meta); if (ret < 0) - goto out; - - if (!dict) - goto out; - - if (dict_get_bin (dict, contri_key, (void **) &contri) == 0) - local->sum += ntoh64 (*contri); + *contri_set = _gf_false; + } + ret = 0; out: - LOCK (&local->lock); - { - val = --local->dentry_child_count; - } - UNLOCK (&local->lock); + if (dict) + dict_unref(dict); - if (val == 0) { - mq_dirty_inode_readdir (local->frame, NULL, this, - 0, 0, NULL, NULL); - } - mq_local_unref (this, local); + if (rsp_dict) + dict_unref(rsp_dict); - return 0; -exit: - mq_local_unref (this, local); - return 0; + return ret; } int32_t -mq_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) +mq_create_size_xattrs(xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc) { - char contri_key [512] = {0, }; - int32_t ret = 0; - int32_t val = 0; - off_t offset = 0; - int32_t count = 0; 
- dict_t *dict = NULL; - quota_local_t *local = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *newframe = NULL; - loc_t loc = {0, }; - - local = mq_local_ref (frame->local); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir failed %s", strerror (op_errno)); - local->err = -1; - - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - - goto end; - } else if (op_ret == 0) { - mq_get_dirty_inode_size (frame, this); - - goto end; - } - - local->dentry_child_count = 0; - - list_for_each_entry (entry, (&entries->list), list) { - gf_log (this->name, GF_LOG_DEBUG, "entry = %s", entry->d_name); + int32_t ret = -1; + quota_meta_t size = { + 0, + }; + dict_t *dict = NULL; - if ((!strcmp (entry->d_name, ".")) || (!strcmp (entry->d_name, - ".."))) { - gf_log (this->name, GF_LOG_DEBUG, "entry = %s", - entry->d_name); - continue; - } - - offset = entry->d_off; - count++; - } + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); - if (count == 0) { - mq_get_dirty_inode_size (frame, this); - goto end; - - } - - local->frame = frame; - - LOCK (&local->lock); - { - local->dentry_child_count = count; - local->d_off = offset; - } - UNLOCK (&local->lock); - - - list_for_each_entry (entry, (&entries->list), list) { - gf_log (this->name, GF_LOG_DEBUG, "entry = %s", entry->d_name); - - if ((!strcmp (entry->d_name, ".")) || (!strcmp (entry->d_name, - ".."))) { - gf_log (this->name, GF_LOG_DEBUG, "entry = %s", - entry->d_name); - continue; - } - - ret = mq_loc_fill_from_name (this, &loc, &local->loc, - entry->d_ino, entry->d_name); - if (ret < 0) - goto out; - - ret = 0; - - LOCK (&local->lock); - { - if (local->err != -2) { - newframe = copy_frame (frame); - if (!newframe) { - ret = -1; - } - } else - ret = -1; - } - UNLOCK (&local->lock); - - if (ret == -1) - goto out; - - newframe->local = mq_local_ref (local); - - dict = dict_new (); - if (!dict) { - ret = -1; - goto out; - } - - GET_CONTRI_KEY (contri_key, 
local->loc.inode->gfid, ret); - if (ret < 0) - goto out; - - ret = dict_set_int64 (dict, contri_key, 0); - if (ret) - goto out; - - STACK_WIND (newframe, - mq_get_child_contribution, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - &loc, dict); - - offset = entry->d_off; - - loc_wipe (&loc); - - newframe = NULL; - - out: - if (dict) { - dict_unref (dict); - dict = NULL; - } - - if (ret) { - val = -2; - mq_test_and_set_local_err (local, &val); - - if (newframe) { - newframe->local = NULL; - mq_local_unref(this, local); - QUOTA_STACK_DESTROY (newframe, this); - } - - break; - } - } + if (loc->inode->ia_type != IA_IFDIR) { + ret = 0; + goto out; + } + + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = quota_dict_set_size_meta(this, dict, &size); + if (ret < 0) + goto out; + + ret = syncop_xattrop(FIRST_CHILD(this), loc, + GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT, dict, NULL, NULL, + NULL); + + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? 
GF_LOG_DEBUG : GF_LOG_ERROR, + "xattrop failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } - if (ret && val != -2) { - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - } -end: - mq_local_unref (this, local); +out: + if (dict) + dict_unref(dict); - return 0; + return ret; } int32_t -mq_dirty_inode_readdir (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, dict_t *xdata) +mq_lock(xlator_t *this, loc_t *loc, short l_type) { - quota_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - local->err = -1; - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - return 0; - } + struct gf_flock lock = { + 0, + }; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + + gf_log(this->name, GF_LOG_DEBUG, "set lock type %d on %s", l_type, + loc->path); + + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = l_type; + lock.l_whence = SEEK_SET; + + ret = syncop_inodelk(FIRST_CHILD(this), this->name, loc, F_SETLKW, &lock, + NULL, NULL); + if (ret < 0) + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? 
GF_LOG_DEBUG : GF_LOG_ERROR, + "inodelk failed " + "for %s: %s", + loc->path, strerror(-ret)); - if (local->fd == NULL) - local->fd = fd_ref (fd); - - STACK_WIND (frame, - mq_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - local->fd, READDIR_BUF, local->d_off, xdata); +out: - return 0; + return ret; } int32_t -mq_check_if_still_dirty (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) +mq_get_dirty(xlator_t *this, loc_t *loc, int32_t *dirty) { - int8_t dirty = -1; - int32_t ret = -1; - fd_t *fd = NULL; - quota_local_t *local = NULL; - - local = frame->local; + int32_t ret = -1; + int8_t value = 0; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + struct iatt stbuf = { + 0, + }; + + dict = dict_new(); + if (dict == NULL) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + goto out; + } + + ret = dict_set_int64(dict, QUOTA_DIRTY_KEY, 0); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "dict set failed"); + goto out; + } + + ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? 
GF_LOG_DEBUG : GF_LOG_ERROR, + "lookup failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + ret = dict_get_int8(rsp_dict, QUOTA_DIRTY_KEY, &value); + if (ret < 0) + goto out; + + *dirty = value; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "failed to get " - "the dirty xattr for %s", local->loc.path); - goto err; - } - - if (!dict) { - ret = -1; - goto err; - } - - ret = dict_get_int8 (dict, QUOTA_DIRTY_KEY, &dirty); - if (ret) - goto err; - - //the inode is not dirty anymore - if (dirty == 0) { - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - - return 0; - } - - fd = fd_create (local->loc.inode, frame->root->pid); - - local->d_off = 0; - - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); - - GF_UUID_ASSERT (local->loc.gfid); - STACK_WIND(frame, - mq_dirty_inode_readdir, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - &local->loc, fd, NULL); - - ret = 0; - -err: - if (op_ret == -1 || ret == -1) { - local->err = -1; - mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL); - } +out: + if (dict) + dict_unref(dict); - if (fd != NULL) { - fd_unref (fd); - } + if (rsp_dict) + dict_unref(rsp_dict); - return 0; + return ret; } int32_t -mq_get_dirty_xattr (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mq_get_set_dirty(xlator_t *this, loc_t *loc, int32_t dirty, int32_t *prev_dirty) { - int32_t ret = -1; - dict_t *xattr_req = NULL; - quota_local_t *local = NULL; - - if (op_ret == -1) { - mq_dirty_inode_updation_done (frame, NULL, this, 0, 0, NULL); - return 0; - } - - local = frame->local; - - xattr_req = dict_new (); - if (xattr_req == NULL) { - ret = -1; - goto err; - } - - ret = dict_set_int8 (xattr_req, QUOTA_DIRTY_KEY, 0); - if (ret) - goto err; + int32_t ret = -1; + int8_t value = 0; + quota_inode_ctx_t *ctx = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + GF_VALIDATE_OR_GOTO("marker", loc, out); 
+ GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + GF_VALIDATE_OR_GOTO("marker", prev_dirty, out); + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get inode ctx for " + "%s", + loc->path); + goto out; + } + + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = dict_set_int8(dict, QUOTA_DIRTY_KEY, dirty); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "dict_set failed"); + goto out; + } + + ret = syncop_xattrop(FIRST_CHILD(this), loc, GF_XATTROP_GET_AND_SET, dict, + NULL, NULL, &rsp_dict); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "xattrop failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + *prev_dirty = 0; + if (rsp_dict) { + ret = dict_get_int8(rsp_dict, QUOTA_DIRTY_KEY, &value); + if (ret == 0) + *prev_dirty = value; + } + + LOCK(&ctx->lock); + { + ctx->dirty = dirty; + } + UNLOCK(&ctx->lock); + ret = 0; +out: + if (dict) + dict_unref(dict); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); + if (rsp_dict) + dict_unref(rsp_dict); - GF_UUID_ASSERT (local->loc.gfid); + return ret; +} - STACK_WIND (frame, - mq_check_if_still_dirty, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - &local->loc, - xattr_req); +int32_t +mq_mark_dirty(xlator_t *this, loc_t *loc, int32_t dirty) +{ + int32_t ret = -1; + dict_t *dict = NULL; + quota_inode_ctx_t *ctx = NULL; + + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get inode ctx for " + "%s", + loc->path); ret = 0; + goto out; + } + + dict = dict_new(); + if (!dict) { + ret = -1; + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + goto out; + } + + ret = 
dict_set_int8(dict, QUOTA_DIRTY_KEY, dirty); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "dict_set failed"); + goto out; + } + + ret = syncop_setxattr(FIRST_CHILD(this), loc, dict, 0, NULL, NULL); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "setxattr dirty = %d " + "failed for %s: %s", + dirty, loc->path, strerror(-ret)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->dirty = dirty; + } + UNLOCK(&ctx->lock); -err: - if (ret) { - local->err = -1; - mq_release_lock_on_dirty_inode(frame, NULL, this, 0, 0, NULL); - } - - if (xattr_req) - dict_unref (xattr_req); +out: + if (dict) + dict_unref(dict); - return 0; + return ret; } -/* return 1 when dirty updation started - * 0 other wise - */ int32_t -mq_update_dirty_inode (xlator_t *this, - loc_t *loc, - quota_inode_ctx_t *ctx, - inode_contribution_t *contribution) +_mq_get_metadata(xlator_t *this, loc_t *loc, quota_meta_t *contri, + quota_meta_t *size, uuid_t contri_gfid) { - int32_t ret = -1; - quota_local_t *local = NULL; - gf_boolean_t status = _gf_false; - struct gf_flock lock = {0, }; - call_frame_t *frame = NULL; - - ret = mq_get_ctx_updation_status (ctx, &status); - if (ret == -1 || status == _gf_true) { - ret = 0; - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (frame == NULL) { - ret = -1; - goto out; + int32_t ret = -1; + quota_meta_t meta = { + 0, + }; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + char size_key[QUOTA_KEY_MAX] = { + 0, + }; + int keylen = 0; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + struct iatt stbuf = { + 0, + }; + + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + + if (size == NULL && contri == NULL) + goto out; + + dict = dict_new(); + if (dict == NULL) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + goto out; + } + + if (size && loc->inode->ia_type == IA_IFDIR) { + GET_SIZE_KEY(this, size_key, keylen); + if (keylen 
< 0) + goto out; + ret = dict_set_int64(dict, size_key, 0); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "dict_set failed."); + goto out; } + } - mq_assign_lk_owner (this, frame); - - local = mq_local_new (); - if (local == NULL) - goto fr_destroy; - - frame->local = local; - ret = mq_loc_copy (&local->loc, loc); + if (contri && !loc_is_root(loc)) { + ret = mq_dict_set_contribution(this, dict, loc, contri_gfid, + contri_key); if (ret < 0) - goto fr_destroy; - - local->ctx = ctx; - - local->contri = contribution; + goto out; + } + + ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "lookup failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + if (size) { + if (loc->inode->ia_type == IA_IFDIR) { + ret = quota_dict_get_meta(rsp_dict, size_key, keylen, &meta); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "dict_get failed."); + goto out; + } - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; + size->size = meta.size; + size->file_count = meta.file_count; + size->dir_count = meta.dir_count; + } else { + size->size = stbuf.ia_blocks * 512; + size->file_count = 1; + size->dir_count = 0; + } + } - if (local->loc.inode == NULL) { - ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "Inode is NULL, so can't stackwind."); - goto fr_destroy; + if (contri && !loc_is_root(loc)) { + ret = quota_dict_get_meta(rsp_dict, contri_key, strlen(contri_key), + &meta); + if (ret < 0) { + contri->size = 0; + contri->file_count = 0; + contri->dir_count = 0; + } else { + contri->size = meta.size; + contri->file_count = meta.file_count; + contri->dir_count = meta.dir_count; } + } - STACK_WIND (frame, - mq_get_dirty_xattr, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->loc, F_SETLKW, &lock, NULL); - return 1; + ret = 0; -fr_destroy: - 
QUOTA_STACK_DESTROY (frame, this); out: + if (dict) + dict_unref(dict); - return 0; -} + if (rsp_dict) + dict_unref(rsp_dict); + return ret; +} int32_t -mq_inode_creation_done (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mq_get_metadata(xlator_t *this, loc_t *loc, quota_meta_t *contri, + quota_meta_t *size, quota_inode_ctx_t *ctx, + inode_contribution_t *contribution) { - quota_local_t *local = NULL; + int32_t ret = -1; - if (frame == NULL) - return 0; + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + GF_VALIDATE_OR_GOTO("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", contribution, out); + + if (size == NULL && contri == NULL) { + ret = 0; + goto out; + } - local = frame->local; + ret = _mq_get_metadata(this, loc, contri, size, contribution->gfid); + if (ret < 0) + goto out; - if (local != NULL) { - mq_initiate_quota_txn (this, &local->loc); + if (size) { + LOCK(&ctx->lock); + { + ctx->size = size->size; + ctx->file_count = size->file_count; + ctx->dir_count = size->dir_count; } + UNLOCK(&ctx->lock); + } - QUOTA_STACK_DESTROY (frame, this); + if (contri) { + LOCK(&contribution->lock); + { + contribution->contribution = contri->size; + contribution->file_count = contri->file_count; + contribution->dir_count = contri->dir_count; + } + UNLOCK(&contribution->lock); + } - return 0; +out: + return ret; } - int32_t -mq_xattr_creation_release_lock (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +mq_get_delta(xlator_t *this, loc_t *loc, quota_meta_t *delta, + quota_inode_ctx_t *ctx, inode_contribution_t *contribution) { - struct gf_flock lock = {0, }; - quota_local_t *local = NULL; + int32_t ret = -1; + quota_meta_t size = { + 0, + }; + quota_meta_t contri = { + 0, + }; - local = frame->local; + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + 
GF_VALIDATE_OR_GOTO("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", contribution, out); - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; + ret = mq_get_metadata(this, loc, &contri, &size, ctx, contribution); + if (ret < 0) + goto out; - STACK_WIND (frame, - mq_inode_creation_done, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &lock, NULL); + mq_compute_delta(delta, &size, &contri); - return 0; +out: + return ret; } - int32_t -mq_create_dirty_xattr (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +mq_remove_contri(xlator_t *this, loc_t *loc, quota_inode_ctx_t *ctx, + inode_contribution_t *contri, quota_meta_t *delta, + uint32_t nlink) { - int32_t ret = -1; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - - if (op_ret < 0) { - goto err; - } - - local = frame->local; + int32_t ret = -1; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; - if (local->loc.inode->ia_type == IA_IFDIR) { - newdict = dict_new (); - if (!newdict) { - goto err; - } - - ret = dict_set_int8 (newdict, QUOTA_DIRTY_KEY, 0); - if (ret == -1) { - goto err; - } - - uuid_copy (local->loc.gfid, local->loc.inode->gfid); - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, mq_xattr_creation_release_lock, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - &local->loc, newdict, 0, NULL); + if (nlink == 1) { + /*File was a last link and has been deleted */ + ret = 0; + goto done; + } + + GET_CONTRI_KEY(this, contri_key, contri->gfid, ret); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "get contri_key " + "failed for %s", + uuid_utoa(contri->gfid)); + goto out; + } + + ret = syncop_removexattr(FIRST_CHILD(this), loc, contri_key, 0, NULL); + if (ret < 0) { + if (-ret == ENOENT || -ret == ESTALE || -ret == ENODATA || + -ret == ENOATTR) { + /* Remove contri in done when unlink operation is + * 
performed, so return success on ENOENT/ESTSLE + * rename operation removes xattr earlier, + * so return success on ENODATA + */ + ret = 0; } else { - mq_xattr_creation_release_lock (frame, NULL, this, 0, 0, NULL); + gf_log_callingfn(this->name, GF_LOG_ERROR, + "removexattr %s failed for %s: %s", contri_key, + loc->path, strerror(-ret)); + goto out; } + } - ret = 0; +done: + LOCK(&contri->lock); + { + contri->contribution += delta->size; + contri->file_count += delta->file_count; + contri->dir_count += delta->dir_count; + } + UNLOCK(&contri->lock); -err: - if (ret < 0) { - mq_xattr_creation_release_lock (frame, NULL, this, 0, 0, NULL); - } + ret = 0; - if (newdict != NULL) - dict_unref (newdict); +out: + QUOTA_FREE_CONTRIBUTION_NODE(ctx, contri); - return 0; + return ret; } - int32_t -mq_create_xattr (xlator_t *this, call_frame_t *frame) +mq_update_contri(xlator_t *this, loc_t *loc, inode_contribution_t *contri, + quota_meta_t *delta) { - int32_t ret = 0; - int64_t *value = NULL; - int64_t *size = NULL; - dict_t *dict = NULL; - char key[512] = {0, }; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contri = NULL; - - if (frame == NULL || this == NULL) - return 0; - - local = frame->local; - - ret = mq_inode_ctx_get (local->loc.inode, this, &ctx); - if (ret < 0) { - ctx = mq_inode_ctx_new (local->loc.inode, this); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "mq_inode_ctx_new failed"); - ret = -1; - goto out; - } - } - - dict = dict_new (); - if (!dict) - goto out; - - if (local->loc.inode->ia_type == IA_IFDIR) { - QUOTA_ALLOC_OR_GOTO (size, int64_t, ret, err); - ret = dict_set_bin (dict, QUOTA_SIZE_KEY, size, 8); - if (ret < 0) - goto free_size; - } - - if ((local->loc.path && strcmp (local->loc.path, "/") != 0) - || (local->loc.inode && !uuid_is_null (local->loc.inode->gfid) && - !__is_root_gfid (local->loc.inode->gfid)) - || (!uuid_is_null (local->loc.gfid) - && !__is_root_gfid (local->loc.gfid))) { - contri = 
mq_add_new_contribution_node (this, ctx, &local->loc); - if (contri == NULL) - goto err; - - QUOTA_ALLOC_OR_GOTO (value, int64_t, ret, err); - GET_CONTRI_KEY (key, local->loc.parent->gfid, ret); - - ret = dict_set_bin (dict, key, value, 8); - if (ret < 0) - goto free_value; - } - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, mq_create_dirty_xattr, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY64, dict, NULL); + int32_t ret = -1; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + dict_t *dict = NULL; + + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + GF_VALIDATE_OR_GOTO("marker", delta, out); + GF_VALIDATE_OR_GOTO("marker", contri, out); + + if (quota_meta_is_null(delta)) { ret = 0; - -free_size: - if (ret < 0) { - GF_FREE (size); - } - -free_value: - if (ret < 0) { - GF_FREE (value); - } - -err: - dict_unref (dict); + goto out; + } + + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + GET_CONTRI_KEY(this, contri_key, contri->gfid, ret); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "get contri_key " + "failed for %s", + uuid_utoa(contri->gfid)); + goto out; + } + + ret = quota_dict_set_meta(dict, contri_key, delta, loc->inode->ia_type); + if (ret < 0) + goto out; + + ret = syncop_xattrop(FIRST_CHILD(this), loc, GF_XATTROP_ADD_ARRAY64, dict, + NULL, NULL, NULL); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? 
GF_LOG_DEBUG : GF_LOG_ERROR, + "xattrop failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + LOCK(&contri->lock); + { + contri->contribution += delta->size; + contri->file_count += delta->file_count; + contri->dir_count += delta->dir_count; + } + UNLOCK(&contri->lock); out: - if (ret < 0) { - mq_xattr_creation_release_lock (frame, NULL, this, 0, 0, NULL); - } + if (dict) + dict_unref(dict); - return 0; + return ret; } - int32_t -mq_check_n_set_inode_xattr (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *dict, - struct iatt *postparent) +mq_update_size(xlator_t *this, loc_t *loc, quota_meta_t *delta) { - quota_local_t *local = NULL; - int64_t *size = NULL, *contri = NULL; - int8_t dirty = 0; - int32_t ret = 0; - char contri_key[512] = {0, }; - - if (op_ret < 0) { - goto out; - } - - local = frame->local; + int32_t ret = -1; + quota_inode_ctx_t *ctx = NULL; + dict_t *dict = NULL; - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size); - if (ret < 0) - goto create_xattr; + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); + GF_VALIDATE_OR_GOTO("marker", delta, out); - ret = dict_get_int8 (dict, QUOTA_DIRTY_KEY, &dirty); - if (ret < 0) - goto create_xattr; - - //check contribution xattr if not root - if ((local->loc.path && strcmp (local->loc.path, "/") != 0) - || (!uuid_is_null (local->loc.gfid) - && !__is_root_gfid (local->loc.gfid)) - || (local->loc.inode - && !uuid_is_null (local->loc.inode->gfid) - && !__is_root_gfid (local->loc.inode->gfid))) { - GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret); - if (ret < 0) - goto out; - - ret = dict_get_bin (dict, contri_key, (void **) &contri); - if (ret < 0) - goto create_xattr; - } + if (quota_meta_is_null(delta)) { + ret = 0; + goto out; + } + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get 
inode ctx for " + "%s", + loc->path); + goto out; + } + + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = quota_dict_set_size_meta(this, dict, delta); + if (ret < 0) + goto out; + + ret = syncop_xattrop(FIRST_CHILD(this), loc, + GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT, dict, NULL, NULL, + NULL); + if (ret < 0) { + gf_log_callingfn( + this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "xattrop failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->size += delta->size; + ctx->file_count += delta->file_count; + if (ctx->dir_count == 0) + ctx->dir_count += delta->dir_count + 1; + else + ctx->dir_count += delta->dir_count; + } + UNLOCK(&ctx->lock); out: - mq_xattr_creation_release_lock (frame, NULL, this, 0, 0, NULL); - return 0; + if (dict) + dict_unref(dict); -create_xattr: - if (uuid_is_null (local->loc.gfid)) { - uuid_copy (local->loc.gfid, buf->ia_gfid); - } - - mq_create_xattr (this, frame); - return 0; + return ret; } - -int32_t -mq_get_xattr (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +mq_synctask_cleanup(int ret, call_frame_t *frame, void *opaque) { - dict_t *xattr_req = NULL; - quota_local_t *local = NULL; - int32_t ret = 0; - - if (op_ret < 0) { - goto lock_err; - } - - local = frame->local; - - xattr_req = dict_new (); - if (xattr_req == NULL) { - goto err; - } - - ret = mq_req_xattr (this, &local->loc, xattr_req); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "cannot request xattr"); - goto err; - } - - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); - - GF_UUID_ASSERT (local->loc.gfid); + quota_synctask_t *args = NULL; - STACK_WIND (frame, mq_check_n_set_inode_xattr, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, &local->loc, xattr_req); + GF_ASSERT(opaque); - dict_unref (xattr_req); 
+ args = (quota_synctask_t *)opaque; + loc_wipe(&args->loc); - return 0; + if (args->stub) + call_resume(args->stub); -err: - mq_xattr_creation_release_lock (frame, NULL, this, 0, 0, NULL); + if (!args->is_static) + GF_FREE(args); - if (xattr_req) - dict_unref (xattr_req); - return 0; - -lock_err: - mq_inode_creation_done (frame, NULL, this, 0, 0, NULL); - return 0; + return 0; } - -int32_t -mq_set_inode_xattr (xlator_t *this, loc_t *loc) +int +mq_synctask1(xlator_t *this, synctask_fn_t task, gf_boolean_t spawn, loc_t *loc, + quota_meta_t *contri, uint32_t nlink, call_stub_t *stub) { - struct gf_flock lock = {0, }; - quota_local_t *local = NULL; - int32_t ret = 0; - call_frame_t *frame = NULL; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto err; - } - - local = mq_local_new (); - if (local == NULL) { - goto err; - } - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - goto err; + int32_t ret = -1; + quota_synctask_t *args = NULL; + quota_synctask_t static_args = { + 0, + }; + + if (spawn) { + QUOTA_ALLOC_OR_GOTO(args, quota_synctask_t, ret, out); + args->is_static = _gf_false; + } else { + args = &static_args; + args->is_static = _gf_true; + } + + args->this = this; + args->stub = stub; + loc_copy(&args->loc, loc); + args->ia_nlink = nlink; + + if (contri) { + args->contri = *contri; + } else { + args->contri.size = -1; + args->contri.file_count = -1; + args->contri.dir_count = -1; + } + + if (spawn) { + ret = synctask_new1(this->ctx->env, 1024 * 16, task, + mq_synctask_cleanup, NULL, args); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to spawn " + "new synctask"); + mq_synctask_cleanup(ret, NULL, args); } + } else { + ret = task(args); + mq_synctask_cleanup(ret, NULL, args); + } - frame->local = local; - - lock.l_len = 0; - lock.l_start = 0; - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; - - STACK_WIND (frame, - mq_get_xattr, - FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->inodelk, - this->name, &local->loc, F_SETLKW, &lock, NULL); - - return 0; - -err: - QUOTA_STACK_DESTROY (frame, this); - - return 0; +out: + return ret; } +int +mq_synctask(xlator_t *this, synctask_fn_t task, gf_boolean_t spawn, loc_t *loc) +{ + return mq_synctask1(this, task, spawn, loc, NULL, -1, NULL); +} int32_t -mq_get_parent_inode_local (xlator_t *this, quota_local_t *local) +mq_prevalidate_txn(xlator_t *this, loc_t *origin_loc, loc_t *loc, + quota_inode_ctx_t **ctx, struct iatt *buf) { - int32_t ret = -1; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; + int32_t ret = -1; + quota_inode_ctx_t *ctxtmp = NULL; + + if (buf) { + if (buf->ia_type == IA_IFREG && IS_DHT_LINKFILE_MODE(buf)) + goto out; + + if (buf->ia_type != IA_IFREG && buf->ia_type != IA_IFLNK && + buf->ia_type != IA_IFDIR) + goto out; + } + + if (origin_loc == NULL || origin_loc->inode == NULL || + gf_uuid_is_null(origin_loc->inode->gfid)) + goto out; + + loc_copy(loc, origin_loc); + + if (gf_uuid_is_null(loc->gfid)) + gf_uuid_copy(loc->gfid, loc->inode->gfid); + + if (!loc_is_root(loc) && loc->parent == NULL) + loc->parent = inode_parent(loc->inode, 0, NULL); + + ret = mq_inode_ctx_get(loc->inode, this, &ctxtmp); + if (ret < 0) { + gf_log_callingfn(this->name, GF_LOG_WARNING, + "inode ctx for " + "is NULL for %s", + loc->path); + goto out; + } + if (ctx) + *ctx = ctxtmp; + + ret = 0; +out: + return ret; +} - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", local, out); +int +mq_create_xattrs_task(void *opaque) +{ + int32_t ret = -1; + gf_boolean_t locked = _gf_false; + gf_boolean_t contri_set = _gf_false; + gf_boolean_t size_set = _gf_false; + gf_boolean_t need_txn = _gf_false; + quota_synctask_t *args = NULL; + quota_inode_ctx_t *ctx = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + gf_boolean_t status = _gf_false; + + GF_ASSERT(opaque); + + args = (quota_synctask_t *)opaque; + loc = &args->loc; + this = 
args->this; + THIS = this; + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to" + "get inode ctx, aborting quota create txn"); + goto out; + } + + if (loc->inode->ia_type == IA_IFDIR) { + /* lock not required for files */ + ret = mq_lock(this, loc, F_WRLCK); + if (ret < 0) + goto out; + locked = _gf_true; + } - local->contri = NULL; + ret = mq_are_xattrs_set(this, loc, &contri_set, &size_set); + if (ret < 0 || (contri_set && size_set)) + goto out; - loc_wipe (&local->loc); + mq_set_ctx_create_status(ctx, _gf_false); + status = _gf_true; - ret = mq_loc_copy (&local->loc, &local->parent_loc); - if (ret < 0) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "loc copy failed"); - goto out; - } + if (loc->inode->ia_type == IA_IFDIR && size_set == _gf_false) { + ret = mq_create_size_xattrs(this, ctx, loc); + if (ret < 0) + goto out; + } - loc_wipe (&local->parent_loc); + need_txn = _gf_true; +out: + if (locked) + ret = mq_lock(this, loc, F_UNLCK); - ret = mq_inode_loc_fill (NULL, local->loc.parent, - &local->parent_loc); - if (ret < 0) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "failed to build parent loc of %s", - local->loc.path); - goto out; - } + if (status == _gf_false) + mq_set_ctx_create_status(ctx, _gf_false); - ret = mq_inode_ctx_get (local->loc.inode, this, &ctx); - if (ret < 0) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "inode ctx get failed"); - goto out; - } + if (need_txn) + ret = mq_initiate_quota_blocking_txn(this, loc, NULL); - local->ctx = ctx; + return ret; +} - if (list_empty (&ctx->contribution_head)) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "contribution node list is empty which " - "is an error"); - ret = -1; - goto out; +static int +_mq_create_xattrs_txn(xlator_t *this, loc_t *origin_loc, struct iatt *buf, + gf_boolean_t spawn) +{ + int32_t ret = -1; + quota_inode_ctx_t *ctx = NULL; + gf_boolean_t status = _gf_true; + loc_t loc = { + 0, + }; + 
inode_contribution_t *contribution = NULL; + + ret = mq_prevalidate_txn(this, origin_loc, &loc, &ctx, buf); + if (ret < 0) + goto out; + + ret = mq_test_and_set_ctx_create_status(ctx, &status); + if (ret < 0 || status == _gf_true) + goto out; + + if (!loc_is_root(&loc) && loc.parent) { + contribution = mq_add_new_contribution_node(this, ctx, &loc); + if (contribution == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "cannot add a new contribution node " + "(%s)", + uuid_utoa(loc.gfid)); + ret = -1; + goto out; + } else { + GF_REF_PUT(contribution); } + } - /* Earlier we used to get the next entry in the list maintained - by the context. In a good situation it works. i.e the next - parent in the directory hierarchy for this path is obtained. - - But consider the below situation: - mount-point: /mnt/point - quota enabled directories within mount point: /a, /b, /c - - Now when some file (file1) in the directory /c is written some data, - then to update the directories, marker has to get the contribution - object for the parent inode, i.e /c. - Beefore, it was being done by - local->contri = (inode_contribution_t *) ctx->contribution_head.next; - It works in the normal situations. But suppose /c is moved to /b. - Now /b's contribution object is added to the end of the list of - parents that the file file1 within /b/c is maintaining. Now if - the file /b/c/file1 is copied to /b/c/new, to update the parent in - the order c, b and / we cannot go to the next element in the list, - as in this case the next contribution object would be / and /b's - contribution will be at the end of the list. So get the proper - parent's contribution, by searching the entire list. 
- */ - contribution = mq_get_contribution_node (local->loc.parent, ctx); - GF_ASSERT (contribution != NULL); - local->contri = contribution; - - ret = 0; + ret = mq_synctask(this, mq_create_xattrs_task, spawn, &loc); out: - return ret; -} - + if (ret < 0 && status == _gf_false) + mq_set_ctx_create_status(ctx, _gf_false); -int32_t -mq_xattr_updation_done (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - QUOTA_STACK_DESTROY (frame, this); - return 0; + loc_wipe(&loc); + return ret; } - -int32_t -mq_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +mq_create_xattrs_txn(xlator_t *this, loc_t *loc, struct iatt *buf) { - int32_t ret = 0; - gf_boolean_t status = _gf_false; - quota_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1 || local->err) { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking failed on path (%s)(%s)", - local->parent_loc.path, strerror (op_errno)); - } - mq_xattr_updation_done (frame, NULL, this, 0, 0, NULL, NULL); - - return 0; - } + int32_t ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "inodelk released on %s", local->parent_loc.path); + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); - if ((strcmp (local->parent_loc.path, "/") == 0) - || (local->delta == 0)) { - mq_xattr_updation_done (frame, NULL, this, 0, 0, NULL, NULL); - } else { - ret = mq_get_parent_inode_local (this, local); - if (ret < 0) { - mq_xattr_updation_done (frame, NULL, this, 0, 0, NULL, - NULL); - goto out; - } - status = _gf_true; - - ret = mq_test_and_set_ctx_updation_status (local->ctx, &status); - if (ret == 0 && status == _gf_false) { - mq_get_lock_on_parent (frame, this); - } else { - mq_xattr_updation_done (frame, NULL, this, 0, 0, NULL, - NULL); - } - } + ret = _mq_create_xattrs_txn(this, loc, buf, _gf_true); out: - return 0; + return ret; } 
- -//now release lock on the parent inode int32_t -mq_release_parent_lock (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +mq_reduce_parent_size_task(void *opaque) { - int32_t ret = 0; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - struct gf_flock lock = {0, }; - - local = frame->local; - - if (local->err != 0) { - gf_log_callingfn (this->name, - (local->err == ENOENT) ? GF_LOG_DEBUG - : GF_LOG_WARNING, - "An operation during quota updation " - "of path (%s) failed (%s)", local->loc.path, - strerror (local->err)); + int32_t ret = -1; + int32_t prev_dirty = 0; + quota_inode_ctx_t *ctx = NULL; + quota_inode_ctx_t *parent_ctx = NULL; + inode_contribution_t *contribution = NULL; + quota_meta_t delta = { + 0, + }; + quota_meta_t contri = { + 0, + }; + loc_t parent_loc = { + 0, + }; + gf_boolean_t locked = _gf_false; + gf_boolean_t dirty = _gf_false; + quota_synctask_t *args = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + gf_boolean_t remove_xattr = _gf_true; + uint32_t nlink = 0; + + GF_ASSERT(opaque); + + args = (quota_synctask_t *)opaque; + loc = &args->loc; + contri = args->contri; + nlink = args->ia_nlink; + this = args->this; + THIS = this; + + ret = mq_inode_loc_fill(NULL, loc->parent, &parent_loc); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "parent_loc fill failed for " + "child inode %s: ", + uuid_utoa(loc->inode->gfid)); + goto out; + } + + ret = mq_lock(this, &parent_loc, F_WRLCK); + if (ret < 0) + goto out; + locked = _gf_true; + + if (contri.size >= 0) { + /* contri parameter is supplied only for rename operation. 
+ * remove xattr is alreday performed, we need to skip + * removexattr for rename operation + */ + remove_xattr = _gf_false; + delta.size = contri.size; + delta.file_count = contri.file_count; + delta.dir_count = contri.dir_count; + } else { + remove_xattr = _gf_true; + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) { + gf_log_callingfn(this->name, GF_LOG_WARNING, + "ctx for" + " the node %s is NULL", + loc->path); + goto out; } - ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx); - if (ret < 0) - goto wind; - - LOCK (&ctx->lock); - { - ctx->dirty = 0; + contribution = mq_get_contribution_node(loc->parent, ctx); + if (contribution == NULL) { + ret = -1; + gf_log(this->name, GF_LOG_DEBUG, + "contribution for the node %s is NULL", loc->path); + goto out; } - UNLOCK (&ctx->lock); - if (local->parent_loc.inode == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "Invalid parent inode."); - goto err; + LOCK(&contribution->lock); + { + delta.size = contribution->contribution; + delta.file_count = contribution->file_count; + delta.dir_count = contribution->dir_count; } + UNLOCK(&contribution->lock); + } -wind: - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; - - STACK_WIND (frame, - mq_inodelk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->parent_loc, - F_SETLKW, &lock, NULL); - - return 0; -err: - mq_xattr_updation_done (frame, NULL, this, - 0, 0 , NULL, NULL); - return 0; -} - - -int32_t -mq_mark_undirty (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - int32_t ret = -1; - int64_t *size = NULL; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + ret = mq_get_set_dirty(this, &parent_loc, 1, &prev_dirty); + if (ret < 0) + goto out; + dirty = _gf_true; - local = frame->local; + mq_sub_meta(&delta, NULL); - if (op_ret == -1) { - gf_log 
(this->name, GF_LOG_WARNING, "%s occurred while" - " updating the size of %s", strerror (op_errno), - local->parent_loc.path); - - goto err; - } - - //update the size of the parent inode - if (dict != NULL) { - ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx); - if (ret < 0) { - op_errno = EINVAL; - goto err; - } - - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size); - if (ret < 0) { - op_errno = EINVAL; - goto err; - } - - LOCK (&ctx->lock); - { - if (size) - ctx->size = ntoh64 (*size); - gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64, - local->parent_loc.path, ctx->size); - } - UNLOCK (&ctx->lock); - } + if (remove_xattr) { + ret = mq_remove_contri(this, loc, ctx, contribution, &delta, nlink); + if (ret < 0) + goto out; + } - newdict = dict_new (); - if (!newdict) { - op_errno = ENOMEM; - goto err; - } + if (quota_meta_is_null(&delta)) + goto out; - ret = dict_set_int8 (newdict, QUOTA_DIRTY_KEY, 0); + ret = mq_update_size(this, &parent_loc, &delta); + if (ret < 0) + goto out; - if (ret == -1) { - op_errno = -ret; - goto err; +out: + if (dirty) { + if (ret < 0 || prev_dirty) { + /* On failure clear dirty status flag. + * In the next lookup inspect_directory_xattr + * can set the status flag and fix the + * dirty directory. 
+ * Do the same if dir was dirty before + * the txn + */ + ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx); + if (ret == 0) + mq_set_ctx_dirty_status(parent_ctx, _gf_false); + } else { + ret = mq_mark_dirty(this, &parent_loc, 0); } + } - uuid_copy (local->parent_loc.gfid, local->parent_loc.inode->gfid); - GF_UUID_ASSERT (local->parent_loc.gfid); + if (locked) + ret = mq_lock(this, &parent_loc, F_UNLCK); - STACK_WIND (frame, mq_release_parent_lock, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - &local->parent_loc, newdict, 0, NULL); - - ret = 0; -err: - if (op_ret == -1 || ret == -1) { - local->err = op_errno; + if (ret >= 0) + ret = mq_initiate_quota_blocking_txn(this, &parent_loc, NULL); - mq_release_parent_lock (frame, NULL, this, 0, 0, NULL); - } + loc_wipe(&parent_loc); - if (newdict) - dict_unref (newdict); + if (contribution) + GF_REF_PUT(contribution); - return 0; + return ret; } - int32_t -mq_update_parent_size (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict, dict_t *xdata) +mq_reduce_parent_size_txn(xlator_t *this, loc_t *origin_loc, + quota_meta_t *contri, uint32_t nlink, + call_stub_t *stub) { - int64_t *size = NULL; - int32_t ret = -1; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - - local = frame->local; + int32_t ret = -1; + loc_t loc = { + 0, + }; + gf_boolean_t resume_stub = _gf_true; - if (op_ret == -1) { - gf_log (this->name, ((op_errno == ENOENT) ? 
GF_LOG_DEBUG : - GF_LOG_WARNING), - "xattrop call failed: %s", strerror (op_errno)); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", origin_loc, out); - goto err; - } + ret = mq_prevalidate_txn(this, origin_loc, &loc, NULL, NULL); + if (ret < 0) + goto out; - LOCK (&local->contri->lock); - { - local->contri->contribution += local->delta; - } - UNLOCK (&local->contri->lock); - - gf_log_callingfn (this->name, GF_LOG_DEBUG, "path: %s size: %"PRId64 - " contribution:%"PRId64, - local->loc.path, local->ctx->size, - local->contri->contribution); - - if (dict == NULL) { - op_errno = EINVAL; - goto err; - } - - ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx); - if (ret < 0) { - op_errno = EINVAL; - goto err; - } - - newdict = dict_new (); - if (!newdict) { - op_errno = ENOMEM; - ret = -1; - goto err; - } - - QUOTA_ALLOC_OR_GOTO (size, int64_t, ret, err); - - *size = hton64 (local->delta); + if (loc_is_root(&loc)) { + ret = 0; + goto out; + } - ret = dict_set_bin (newdict, QUOTA_SIZE_KEY, size, 8); - if (ret < 0) { - op_errno = -ret; - goto err; - } + resume_stub = _gf_false; + ret = mq_synctask1(this, mq_reduce_parent_size_task, _gf_true, &loc, contri, + nlink, stub); +out: + loc_wipe(&loc); - if (uuid_is_null (local->parent_loc.gfid)) - uuid_copy (local->parent_loc.gfid, - local->parent_loc.inode->gfid); - GF_UUID_ASSERT (local->parent_loc.gfid); - - STACK_WIND (frame, - mq_mark_undirty, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - &local->parent_loc, - GF_XATTROP_ADD_ARRAY64, - newdict, NULL); - ret = 0; -err: - if (op_ret == -1 || ret < 0) { - local->err = op_errno; - mq_release_parent_lock (frame, NULL, this, 0, 0, NULL); - } + if (resume_stub && stub) + call_resume(stub); - if (newdict) - dict_unref (newdict); + if (ret) + gf_log_callingfn(this ? 
this->name : "Marker", GF_LOG_ERROR, + "mq_reduce_parent_size_txn failed"); - return 0; + return ret; } -int32_t -mq_update_inode_contribution (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, - struct iatt *postparent) +int +mq_initiate_quota_task(void *opaque) { - int32_t ret = -1; - int64_t *size = NULL, size_int = 0, contri_int = 0; - int64_t *contri = NULL; - int64_t *delta = NULL; - char contri_key [512] = {0, }; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, ((op_errno == ENOENT) ? GF_LOG_DEBUG : - GF_LOG_WARNING), - "failed to get size and contribution of path (%s)(%s)", - local->loc.path, strerror (op_errno)); - goto err; - } - - ctx = local->ctx; - contribution = local->contri; - - //prepare to update size & contribution of the inode - GET_CONTRI_KEY (contri_key, contribution->gfid, ret); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - LOCK (&ctx->lock); - { - if (local->loc.inode->ia_type == IA_IFDIR ) { - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, - (void **) &size); - if (ret < 0) { - op_errno = EINVAL; - goto unlock; - } - - ctx->size = ntoh64 (*size); - } else - ctx->size = buf->ia_blocks * 512; - - size_int = ctx->size; - } -unlock: - UNLOCK (&ctx->lock); - + int32_t ret = -1; + int32_t prev_dirty = 0; + loc_t child_loc = { + 0, + }; + loc_t parent_loc = { + 0, + }; + gf_boolean_t locked = _gf_false; + gf_boolean_t dirty = _gf_false; + gf_boolean_t status = _gf_false; + quota_meta_t delta = { + 0, + }; + quota_synctask_t *args = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + inode_contribution_t *contri = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_inode_ctx_t *parent_ctx = NULL; + inode_t *tmp_parent = NULL; + + GF_VALIDATE_OR_GOTO("marker", opaque, out); + + args = 
(quota_synctask_t *)opaque; + loc = &args->loc; + this = args->this; + + GF_VALIDATE_OR_GOTO("marker", this, out); + THIS = this; + + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = mq_loc_copy(&child_loc, loc); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "loc copy failed"); + goto out; + } + + while (!__is_root_gfid(child_loc.gfid)) { + ret = mq_inode_ctx_get(child_loc.inode, this, &ctx); if (ret < 0) { - goto err; - } - - ret = dict_get_bin (dict, contri_key, (void **) &contri); - - LOCK (&contribution->lock); - { - if (ret < 0) - contribution->contribution = 0; - else - contribution->contribution = ntoh64 (*contri); - - contri_int = contribution->contribution; + gf_log(this->name, GF_LOG_WARNING, + "inode ctx get failed for %s, " + "aborting update txn", + child_loc.path); + goto out; } - UNLOCK (&contribution->lock); - - gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 "%"PRId64, - local->loc.path, size_int, contri_int); - - local->delta = size_int - contri_int; - if (local->delta == 0) { - mq_mark_undirty (frame, NULL, this, 0, 0, NULL, NULL); - return 0; + /* To improve performance, abort current transaction + * if one is already in progress for same inode + */ + if (status == _gf_true) { + /* status will already set before txn start, + * so it should not be set in first + * loop iteration + */ + ret = mq_test_and_set_ctx_updation_status(ctx, &status); + if (ret < 0 || status == _gf_true) + goto out; } - newdict = dict_new (); - if (newdict == NULL) { - op_errno = ENOMEM; + if (child_loc.parent == NULL) { + ret = mq_build_ancestry(this, &child_loc); + if (ret < 0 || child_loc.parent == NULL) { + /* If application performs parallel remove + * operations on same set of files/directories + * then we may get ENOENT/ESTALE + */ + gf_log(this->name, + (-ret == ENOENT || -ret == ESTALE) ? 
GF_LOG_DEBUG + : GF_LOG_ERROR, + "build ancestry failed for inode %s", + uuid_utoa(child_loc.inode->gfid)); ret = -1; - goto err; + goto out; + } } - QUOTA_ALLOC_OR_GOTO (delta, int64_t, ret, err); - - *delta = hton64 (local->delta); - - ret = dict_set_bin (newdict, contri_key, delta, 8); + ret = mq_inode_loc_fill(NULL, child_loc.parent, &parent_loc); if (ret < 0) { - op_errno = -ret; - ret = -1; - goto err; + gf_log(this->name, GF_LOG_ERROR, + "parent_loc fill " + "failed for child inode %s: ", + uuid_utoa(child_loc.inode->gfid)); + goto out; } - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, - mq_update_parent_size, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY64, - newdict, NULL); - ret = 0; - -err: - if (op_ret == -1 || ret < 0) { - local->err = op_errno; - - mq_release_parent_lock (frame, NULL, this, 0, 0, NULL); - } - - if (newdict) - dict_unref (newdict); - - return 0; -} + ret = mq_lock(this, &parent_loc, F_WRLCK); + if (ret < 0) + goto out; + locked = _gf_true; -int32_t -mq_fetch_child_size_and_contri (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) -{ - int32_t ret = -1; - char contri_key [512] = {0, }; - dict_t *newdict = NULL; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, (op_errno == ENOENT) ? 
GF_LOG_DEBUG - : GF_LOG_WARNING, - "couldnt mark inode corresponding to path (%s) dirty " - "(%s)", local->parent_loc.path, strerror (op_errno)); - goto err; - } + mq_set_ctx_updation_status(ctx, _gf_false); + status = _gf_true; - VALIDATE_OR_GOTO (local->ctx, err); - VALIDATE_OR_GOTO (local->contri, err); + /* Contribution node can be NULL in below scenarios and + create if needed: - gf_log (this->name, GF_LOG_DEBUG, "%s marked dirty", - local->parent_loc.path); + Scenario 1) + In this case create a new contribution node + Suppose hard link for a file f1 present in a directory d1 is + created in the directory d2 (as f2). Now, since d2's + contribution is not there in f1's inode ctx, d2's + contribution xattr won't be created and will create problems + for quota operations. - //update parent ctx - ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx); - if (ret == -1) { - op_errno = EINVAL; - goto err; + Don't create contribution if parent has been changed after + taking a lock, this can happen when rename is performed + and writes is still in-progress for the same file + + Scenario 2) + When a rename operation is performed, contribution node + for olp path will be removed. + + Create contribution node only if oldparent is same as + newparent. 
+ Consider below example + 1) rename FOP invoked on file 'x' + 2) write is still in progress for file 'x' + 3) rename takes a lock on old-parent + 4) write-update txn blocked on old-parent to acquire lock + 5) in rename_cbk, contri xattrs are removed and contribution + is deleted and lock is released + 6) now write-update txn gets the lock and updates the + wrong parent as it was holding lock on old parent + so validate parent once the lock is acquired + + For more information on this problem, please see + doc for marker_rename in file marker.c + */ + contri = mq_get_contribution_node(child_loc.parent, ctx); + if (contri == NULL) { + tmp_parent = inode_parent(child_loc.inode, 0, NULL); + if (tmp_parent == NULL) { + /* This can happen if application performs + * parallel remove operations on same set + * of files/directories + */ + gf_log(this->name, GF_LOG_WARNING, + "parent is " + "NULL for inode %s", + uuid_utoa(child_loc.inode->gfid)); + ret = -1; + goto out; + } + if (gf_uuid_compare(tmp_parent->gfid, parent_loc.gfid)) { + /* abort txn if parent has changed */ + ret = 0; + goto out; + } + + inode_unref(tmp_parent); + tmp_parent = NULL; + + contri = mq_add_new_contribution_node(this, ctx, &child_loc); + if (contri == NULL) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to " + "create contribution node for %s, " + "abort update txn", + child_loc.path); + ret = -1; + goto out; + } } - LOCK (&ctx->lock); - { - ctx->dirty = 1; - } - UNLOCK (&ctx->lock); + ret = mq_get_delta(this, &child_loc, &delta, ctx, contri); + if (ret < 0) + goto out; - newdict = dict_new (); - if (newdict == NULL) { - op_errno = ENOMEM; - goto err; - } + if (quota_meta_is_null(&delta)) + goto out; - if (local->loc.inode->ia_type == IA_IFDIR) { - ret = dict_set_int64 (newdict, QUOTA_SIZE_KEY, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "dict_set failed."); - goto err; - } - } + ret = mq_get_set_dirty(this, &parent_loc, 1, &prev_dirty); + if (ret < 0) + goto out; + dirty = 
_gf_true; - GET_CONTRI_KEY (contri_key, local->contri->gfid, ret); - if (ret < 0) { - op_errno = ENOMEM; - goto err; - } + ret = mq_update_contri(this, &child_loc, contri, &delta); + if (ret < 0) + goto out; - ret = dict_set_int64 (newdict, contri_key, 0); + ret = mq_update_size(this, &parent_loc, &delta); if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "dict_set failed."); - goto err; + gf_log(this->name, GF_LOG_DEBUG, + "rollback " + "contri updation"); + mq_sub_meta(&delta, NULL); + mq_update_contri(this, &child_loc, contri, &delta); + goto out; } - mq_set_ctx_updation_status (local->ctx, _gf_false); - - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND (frame, mq_update_inode_contribution, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, &local->loc, newdict); - - ret = 0; - -err: - if ((op_ret == -1) || (ret < 0)) { - local->err = op_errno; - - mq_set_ctx_updation_status (local->ctx, _gf_false); - - mq_release_parent_lock (frame, NULL, this, 0, 0, NULL); + if (prev_dirty == 0) { + ret = mq_mark_dirty(this, &parent_loc, 0); + } else { + ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx); + if (ret == 0) + mq_set_ctx_dirty_status(parent_ctx, _gf_false); } + dirty = _gf_false; + prev_dirty = 0; - if (newdict) - dict_unref (newdict); - - return 0; -} - -int32_t -mq_markdirty (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - int32_t ret = -1; - dict_t *dict = NULL; - quota_local_t *local = NULL; - - local = frame->local; + ret = mq_lock(this, &parent_loc, F_UNLCK); + locked = _gf_false; - if (op_ret == -1){ - gf_log (this->name, (op_errno == ENOENT) ? 
GF_LOG_DEBUG - : GF_LOG_WARNING, "acquiring locks failed on %s (%s)", - local->parent_loc.path, strerror (op_errno)); + if (__is_root_gfid(parent_loc.gfid)) + break; - local->err = op_errno; - - mq_set_ctx_updation_status (local->ctx, _gf_false); - - mq_inodelk_cbk (frame, NULL, this, 0, 0, NULL); - - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "inodelk succeeded on %s", local->parent_loc.path); - - dict = dict_new (); - if (!dict) { - ret = -1; - goto err; - } + /* Repeate above steps upwards till the root */ + loc_wipe(&child_loc); + ret = mq_loc_copy(&child_loc, &parent_loc); + if (ret < 0) + goto out; - ret = dict_set_int8 (dict, QUOTA_DIRTY_KEY, 1); - if (ret == -1) - goto err; + loc_wipe(&parent_loc); + GF_REF_PUT(contri); + contri = NULL; + } - uuid_copy (local->parent_loc.gfid, - local->parent_loc.inode->gfid); - GF_UUID_ASSERT (local->parent_loc.gfid); +out: + if ((dirty) && (ret < 0)) { + /* On failure clear dirty status flag. + * In the next lookup inspect_directory_xattr + * can set the status flag and fix the + * dirty directory. 
+ * Do the same if the dir was dirty before + * txn + */ + ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx); + if (ret == 0) + mq_set_ctx_dirty_status(parent_ctx, _gf_false); + } - STACK_WIND (frame, mq_fetch_child_size_and_contri, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - &local->parent_loc, dict, 0, NULL); + if (locked) + ret = mq_lock(this, &parent_loc, F_UNLCK); - ret = 0; -err: - if (ret == -1) { - local->err = 1; + if (ctx && status == _gf_false) + mq_set_ctx_updation_status(ctx, _gf_false); - mq_set_ctx_updation_status (local->ctx, _gf_false); + loc_wipe(&child_loc); + loc_wipe(&parent_loc); - mq_release_parent_lock (frame, NULL, this, 0, 0, NULL); - } + if (tmp_parent) + inode_unref(tmp_parent); - if (dict) - dict_unref (dict); + if (contri) + GF_REF_PUT(contri); - return 0; + return 0; } - -int32_t -mq_get_lock_on_parent (call_frame_t *frame, xlator_t *this) +int +_mq_initiate_quota_txn(xlator_t *this, loc_t *origin_loc, struct iatt *buf, + gf_boolean_t spawn) { - struct gf_flock lock = {0, }; - quota_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("marker", frame, fr_destroy); - - local = frame->local; - gf_log (this->name, GF_LOG_DEBUG, "taking lock on %s", - local->parent_loc.path); - - if (local->parent_loc.inode == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "parent inode is not valid, aborting " - "transaction."); - goto fr_destroy; - } - - lock.l_len = 0; - lock.l_start = 0; - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; + int32_t ret = -1; + quota_inode_ctx_t *ctx = NULL; + gf_boolean_t status = _gf_true; + loc_t loc = { + 0, + }; + + ret = mq_prevalidate_txn(this, origin_loc, &loc, &ctx, buf); + if (ret < 0) + goto out; + + if (loc_is_root(&loc)) { + ret = 0; + goto out; + } - STACK_WIND (frame, - mq_markdirty, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->parent_loc, F_SETLKW, &lock, NULL); + ret = mq_test_and_set_ctx_updation_status(ctx, &status); + if (ret < 0 || status == 
_gf_true) + goto out; - return 0; + ret = mq_synctask(this, mq_initiate_quota_task, spawn, &loc); -fr_destroy: - QUOTA_STACK_DESTROY (frame, this); +out: + if (ret < 0 && status == _gf_false) + mq_set_ctx_updation_status(ctx, _gf_false); - return -1; + loc_wipe(&loc); + return ret; } int -mq_prepare_txn_frame (xlator_t *this, loc_t *loc, - quota_inode_ctx_t *ctx, - inode_contribution_t *contri, - call_frame_t **new_frame) +mq_initiate_quota_txn(xlator_t *this, loc_t *loc, struct iatt *buf) { - call_frame_t *frame = NULL; - int ret = -1; - quota_local_t *local = NULL; - - if (!this || !loc || !new_frame) - goto err; - - frame = create_frame (this, this->ctx->pool); - if (frame == NULL) - goto err; + int32_t ret = -1; - mq_assign_lk_owner (this, frame); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); - local = mq_local_new (); - if (local == NULL) - goto fr_destroy; - - frame->local = local; - - ret = mq_loc_copy (&local->loc, loc); - if (ret < 0) - goto fr_destroy; - - ret = mq_inode_loc_fill (NULL, local->loc.parent, - &local->parent_loc); - if (ret < 0) - goto fr_destroy; - - local->ctx = ctx; - local->contri = contri; - - ret = 0; - *new_frame = frame; - - return ret; - -fr_destroy: - QUOTA_STACK_DESTROY (frame, this); -err: - return ret; + ret = _mq_initiate_quota_txn(this, loc, buf, _gf_true); +out: + return ret; } int -mq_start_quota_txn (xlator_t *this, loc_t *loc, - quota_inode_ctx_t *ctx, - inode_contribution_t *contri) +mq_initiate_quota_blocking_txn(xlator_t *this, loc_t *loc, struct iatt *buf) { - int32_t ret = -1; - call_frame_t *frame = NULL; + int32_t ret = -1; - ret = mq_prepare_txn_frame (this, loc, ctx, - contri, &frame); - if (ret) - goto err; + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); - ret = mq_get_lock_on_parent (frame, this); - if (ret == -1) - goto err; - - 
return 0; - -err: - mq_set_ctx_updation_status (ctx, _gf_false); - - return -1; + ret = _mq_initiate_quota_txn(this, loc, buf, _gf_false); +out: + return ret; } - int -mq_initiate_quota_txn (xlator_t *this, loc_t *loc) +mq_update_dirty_inode_task(void *opaque) { - int32_t ret = -1; - gf_boolean_t status = _gf_false; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", loc, out); - GF_VALIDATE_OR_GOTO ("marker", loc->inode, out); - - ret = mq_inode_ctx_get (loc->inode, this, &ctx); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "inode ctx get failed, aborting quota txn"); - ret = -1; - goto out; - } - - /* Create the contribution node if its absent. Is it right to - assume that if the contribution node is not there, then - create one and proceed instead of returning? - Reason for this assumption is for hard links. Suppose - hard link for a file f1 present in a directory d1 is - created in the directory d2 (as f2). Now, since d2's - contribution is not there in f1's inode ctx, d2's - contribution xattr wont be created and will create problems - for quota operations. - */ - contribution = mq_get_contribution_node (loc->parent, ctx); - if (!contribution) { - if ((loc->path && strcmp (loc->path, "/")) - || (!uuid_is_null (loc->gfid) - && !__is_root_gfid (loc->gfid)) - || (loc->inode && !uuid_is_null (loc->inode->gfid) - && !__is_root_gfid (loc->inode->gfid))) - gf_log_callingfn (this->name, GF_LOG_TRACE, - "contribution node for the " - "path (%s) with parent (%s) " - "not found", loc->path, - loc->parent? - uuid_utoa (loc->parent->gfid): - NULL); - - contribution = mq_add_new_contribution_node (this, ctx, loc); - if (!contribution) { - if(loc->path && strcmp (loc->path, "/")) - gf_log_callingfn (this->name, GF_LOG_WARNING, - "could not allocate " - " contribution node for (%s) " - "parent: (%s)", loc->path, - loc->parent? 
- uuid_utoa (loc->parent->gfid): - NULL); - goto out; - } - } - - /* To improve performance, do not start another transaction - * if one is already in progress for same inode - */ - status = _gf_true; - - ret = mq_test_and_set_ctx_updation_status (ctx, &status); - if (ret < 0) - goto out; - - if (status == _gf_false) { - mq_start_quota_txn (this, loc, ctx, contribution); - } - + int32_t ret = -1; + fd_t *fd = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + gf_boolean_t locked = _gf_false; + gf_boolean_t updated = _gf_false; + int32_t dirty = 0; + quota_meta_t contri = { + 0, + }; + quota_meta_t size = { + 0, + }; + quota_meta_t contri_sum = { + 0, + }; + quota_meta_t delta = { + 0, + }; + quota_synctask_t *args = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + quota_inode_ctx_t *ctx = NULL; + dict_t *xdata = NULL; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + int keylen = 0; + + GF_ASSERT(opaque); + + args = (quota_synctask_t *)opaque; + loc = &args->loc; + this = args->this; + THIS = this; + INIT_LIST_HEAD(&entries.list); + + ret = mq_inode_ctx_get(loc->inode, this, &ctx); + if (ret < 0) + goto out; + + GET_CONTRI_KEY(this, contri_key, loc->gfid, keylen); + if (keylen < 0) { + ret = keylen; + goto out; + } + + xdata = dict_new(); + if (xdata == NULL) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = dict_set_int64(xdata, contri_key, 0); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, "dict_set failed"); + goto out; + } + + ret = mq_lock(this, loc, F_WRLCK); + if (ret < 0) + goto out; + locked = _gf_true; + + ret = mq_get_dirty(this, loc, &dirty); + if (ret < 0 || dirty == 0) { ret = 0; -out: - return ret; -} + goto out; + } + + fd = fd_create(loc->inode, 0); + if (!fd) { + gf_log(this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir(this, loc, fd, NULL, NULL); + if (ret < 0) { + gf_log(this->name, + (-ret == ENOENT || -ret == 
ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "opendir failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + + fd_bind(fd); + while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, xdata, + NULL)) != 0) { + if (ret < 0) { + gf_log(this->name, + (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG + : GF_LOG_ERROR, + "readdirp failed " + "for %s: %s", + loc->path, strerror(-ret)); + goto out; + } + if (list_empty(&entries.list)) + break; + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + memset(&contri, 0, sizeof(contri)); + quota_dict_get_meta(entry->dict, contri_key, keylen, &contri); + if (quota_meta_is_null(&contri)) + continue; -int32_t -mq_inspect_directory_xattr (xlator_t *this, - loc_t *loc, - dict_t *dict, - struct iatt buf) -{ - int32_t ret = 0; - int8_t dirty = -1; - int64_t *size = NULL, size_int = 0; - int64_t *contri = NULL, contri_int = 0; - char contri_key [512] = {0, }; - gf_boolean_t not_root = _gf_false; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - ret = mq_inode_ctx_get (loc->inode, this, &ctx); - if (ret < 0) { - ctx = mq_inode_ctx_new (loc->inode, this); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "mq_inode_ctx_new failed"); - ret = -1; - goto err; - } + mq_add_meta(&contri_sum, &contri); } - if (!loc->path || (loc->path && strcmp (loc->path, "/") != 0)) { - contribution = mq_add_new_contribution_node (this, ctx, loc); - if (contribution == NULL) { - if (!uuid_is_null (loc->inode->gfid)) - gf_log (this->name, GF_LOG_DEBUG, - "cannot add a new contribution node " - "(%s)", uuid_utoa (loc->inode->gfid)); - ret = -1; - goto err; - } - } + gf_dirent_free(&entries); + } + /* Inculde for self */ + contri_sum.dir_count++; - ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size); - if (ret < 0) - goto out; + ret = _mq_get_metadata(this, loc, NULL, &size, 0); + if 
(ret < 0) + goto out; - ret = dict_get_int8 (dict, QUOTA_DIRTY_KEY, &dirty); - if (ret < 0) - goto out; + mq_compute_delta(&delta, &contri_sum, &size); - if ((loc->path && strcmp (loc->path, "/") != 0) - || (!uuid_is_null (loc->gfid) && !__is_root_gfid (loc->gfid)) - || (loc->inode && !uuid_is_null (loc->inode->gfid) && - !__is_root_gfid (loc->inode->gfid))) { - not_root = _gf_true; - - GET_CONTRI_KEY (contri_key, contribution->gfid, ret); - if (ret < 0) - goto out; - - ret = dict_get_bin (dict, contri_key, (void **) &contri); - if (ret < 0) - goto out; - - LOCK (&contribution->lock); - { - contribution->contribution = ntoh64 (*contri); - contri_int = contribution->contribution; - } - UNLOCK (&contribution->lock); - } + if (quota_meta_is_null(&delta)) + goto out; - LOCK (&ctx->lock); - { - ctx->size = ntoh64 (*size); - ctx->dirty = dirty; - size_int = ctx->size; - } - UNLOCK (&ctx->lock); + gf_log(this->name, GF_LOG_INFO, + "calculated size = %" PRId64 ", original size = %" PRIu64 + ", diff = %" PRIu64 ", path = %s ", + contri_sum.size, size.size, delta.size, loc->path); - gf_log (this->name, GF_LOG_DEBUG, "size=%"PRId64 - " contri=%"PRId64, size_int, contri_int); + gf_log(this->name, GF_LOG_INFO, + "calculated f_count = %" PRId64 ", original f_count = %" PRIu64 + ", diff = %" PRIu64 ", path = %s ", + contri_sum.file_count, size.file_count, delta.file_count, loc->path); - if (dirty) { - ret = mq_update_dirty_inode (this, loc, ctx, contribution); - } + gf_log(this->name, GF_LOG_INFO, + "calculated d_count = %" PRId64 ", original d_count = %" PRIu64 + ", diff = %" PRIu64 ", path = %s ", + contri_sum.dir_count, size.dir_count, delta.dir_count, loc->path); - if ((!dirty || ret == 0) && (not_root == _gf_true) && - (size_int != contri_int)) { - mq_initiate_quota_txn (this, loc); - } + ret = mq_update_size(this, loc, &delta); + if (ret < 0) + goto out; + + updated = _gf_true; - ret = 0; out: - if (ret) - mq_set_inode_xattr (this, loc); -err: - return ret; -} + 
gf_dirent_free(&entries); -int32_t -mq_inspect_file_xattr (xlator_t *this, - loc_t *loc, - dict_t *dict, - struct iatt buf) -{ - int32_t ret = -1; - uint64_t contri_int = 0, size = 0; - int64_t *contri_ptr = NULL; - char contri_key [512] = {0, }; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - ret = mq_inode_ctx_get (loc->inode, this, &ctx); - if (ret < 0) { - ctx = mq_inode_ctx_new (loc->inode, this); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "mq_inode_ctx_new failed"); - ret = -1; - goto out; - } - } + if (fd) + fd_unref(fd); - contribution = mq_add_new_contribution_node (this, ctx, loc); - if (contribution == NULL) { - gf_log_callingfn (this->name, GF_LOG_DEBUG, "cannot allocate " - "contribution node (path:%s)", loc->path); - goto out; - } + if (xdata) + dict_unref(xdata); - LOCK (&ctx->lock); - { - ctx->size = 512 * buf.ia_blocks; - size = ctx->size; - } - UNLOCK (&ctx->lock); - - list_for_each_entry (contribution, &ctx->contribution_head, - contri_list) { - GET_CONTRI_KEY (contri_key, contribution->gfid, ret); - if (ret < 0) - continue; - - ret = dict_get_bin (dict, contri_key, (void **) &contri_int); - if (ret == 0) { - contri_ptr = (int64_t *)(unsigned long)contri_int; - - LOCK (&contribution->lock); - { - contribution->contribution = ntoh64 (*contri_ptr); - contri_int = contribution->contribution; - } - UNLOCK (&contribution->lock); - - gf_log (this->name, GF_LOG_DEBUG, - "size=%"PRId64 " contri=%"PRId64, size, contri_int); - - if (size != contri_int) { - mq_initiate_quota_txn (this, loc); - } - } else { - if (size) - mq_initiate_quota_txn (this, loc); - else - mq_set_inode_xattr (this, loc); - } - } + if (ret < 0) { + /* On failure clear dirty status flag. 
+ * In the next lookup inspect_directory_xattr + * can set the status flag and fix the + * dirty directory + */ + if (ctx) + mq_set_ctx_dirty_status(ctx, _gf_false); + } else if (dirty) { + mq_mark_dirty(this, loc, 0); + } -out: - return ret; -} + if (locked) + mq_lock(this, loc, F_UNLCK); -int32_t -mq_xattr_state (xlator_t *this, - loc_t *loc, - dict_t *dict, - struct iatt buf) -{ - if (buf.ia_type == IA_IFREG || - buf.ia_type == IA_IFLNK) { - mq_inspect_file_xattr (this, loc, dict, buf); - } else if (buf.ia_type == IA_IFDIR) - mq_inspect_directory_xattr (this, loc, dict, buf); + if (updated) + mq_initiate_quota_blocking_txn(this, loc, NULL); - return 0; + return ret; } int32_t -mq_req_xattr (xlator_t *this, - loc_t *loc, - dict_t *dict) +mq_update_dirty_inode_txn(xlator_t *this, loc_t *loc, quota_inode_ctx_t *ctx) { - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", dict, out); - - if (!loc) - goto set_size; - - //if not "/" then request contribution - if (loc->path && strcmp (loc->path, "/") == 0) - goto set_size; - - ret = mq_dict_set_contribution (this, dict, loc); - if (ret == -1) - goto out; - -set_size: - ret = dict_set_uint64 (dict, QUOTA_SIZE_KEY, 0); - if (ret < 0) { - ret = -1; - goto out; - } + int32_t ret = -1; + gf_boolean_t status = _gf_true; - ret = dict_set_int8 (dict, QUOTA_DIRTY_KEY, 0); - if (ret < 0) { - ret = -1; - goto out; - } + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", loc->inode, out); - ret = 0; + mq_test_and_set_ctx_status(ctx, &ctx->dirty_status, &status); + if (status == _gf_true) + goto out; + ret = mq_synctask(this, mq_update_dirty_inode_task, _gf_true, loc); out: - return ret; -} - + if (ret < 0 && status == _gf_false) + mq_set_ctx_dirty_status(ctx, _gf_false); -int32_t -mq_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - QUOTA_STACK_DESTROY (frame, this); - - return 0; + return 
ret; } int32_t -_mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +mq_inspect_directory_xattr(xlator_t *this, quota_inode_ctx_t *ctx, + inode_contribution_t *contribution, loc_t *loc, + dict_t *dict) { - int32_t ret = 0; - char contri_key [512] = {0, }; - quota_local_t *local = NULL; - inode_t *inode = NULL; - dentry_t *tmp = NULL; - gf_boolean_t last_dentry = _gf_true; - loc_t loc = {0, }; - dentry_t *other_dentry = NULL; - gf_boolean_t remove = _gf_false; - - local = (quota_local_t *) frame->local; - - if (op_ret == -1 || local->err == -1) { - mq_removexattr_cbk (frame, NULL, this, -1, 0, NULL); - return 0; - } + int32_t ret = -1; + int8_t dirty = -1; + quota_meta_t size = { + 0, + }; + quota_meta_t contri = { + 0, + }; + quota_meta_t delta = { + 0, + }; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + char size_key[QUOTA_KEY_MAX] = { + 0, + }; + int keylen = 0; + gf_boolean_t status = _gf_false; + + ret = dict_get_int8(dict, QUOTA_DIRTY_KEY, &dirty); + if (ret < 0) { + /* dirty is set only on the first file write operation + * so ignore this error + */ + ret = 0; + dirty = 0; + } + + GET_SIZE_KEY(this, size_key, keylen); + if (keylen < 0) { + ret = -1; + goto out; + } + ret = _quota_dict_get_meta(this, dict, size_key, keylen, &size, IA_IFDIR, + _gf_false); + if (ret < 0) + goto create_xattr; + + if (!contribution) + goto create_xattr; + + if (!loc_is_root(loc)) { + GET_CONTRI_KEY(this, contri_key, contribution->gfid, keylen); + if (keylen < 0) { + ret = -1; + goto out; + } + ret = _quota_dict_get_meta(this, dict, contri_key, keylen, &contri, + IA_IFDIR, _gf_false); + if (ret < 0) + goto create_xattr; - frame->local = NULL; + LOCK(&contribution->lock); + { + contribution->contribution = contri.size; + contribution->file_count = contri.file_count; + contribution->dir_count = contri.dir_count; + } + UNLOCK(&contribution->lock); + } + + LOCK(&ctx->lock); + { + ctx->size = size.size; + 
ctx->file_count = size.file_count; + ctx->dir_count = size.dir_count; + ctx->dirty = dirty; + } + UNLOCK(&ctx->lock); + + ret = mq_get_ctx_updation_status(ctx, &status); + if (ret < 0 || status == _gf_true) { + /* If the update txn is in progress abort inspection */ + ret = 0; + goto out; + } - GET_CONTRI_KEY (contri_key, local->contri->gfid, ret); + mq_compute_delta(&delta, &size, &contri); - if (!local->loc.inode) - inode = inode_grep (local->loc.parent->table, local->loc.parent, - local->loc.name); - else - inode = inode_ref (local->loc.inode); - - /* Suppose there are 2 directories dir1 and dir2. Quota limit is set on - both the directories. There is a file (f1) in dir1. A hark link is - created for that file inside the directory dir2 (say f2). Now one - more xattr is set in the inode as a new hard link is created in a - separate directory. - i.e trusted.glusterfs.quota.<gfid of dir2>.contri=<contribution> - - Now when the hardlink f2 is removed, then the new xattr added (i.e - the xattr indicating its contribution to ITS parent directory) should - be removed (IFF there is not another hardlink for that file in the - same directory). - - To do that upon getting unlink first check whether any other hard - links for the same inode exists in the same directory. If so do not - do anything and proceed for quota transaction. - Otherwise, if the removed entry was the only link for that inode - within that directory, then get another dentry for the inode - (by traversing the list of dentries for the inode) and using the - the dentry's parent and name, send removexattr so that the xattr - is removed. - - If it is not done, then if the volume is restarted or the brick - process is restarted, then wrong quota usage will be shown for the - directory dir2. 
- */ - if (inode) { - tmp = NULL; - list_for_each_entry (tmp, &inode->dentry_list, inode_list) { - if (local->loc.parent == tmp->parent) { - if (strcmp (local->loc.name, local->loc.name)) { - last_dentry = _gf_false; - break; - } - } - } - remove = last_dentry; - } + if (dirty) { + ret = mq_update_dirty_inode_txn(this, loc, ctx); + goto out; + } - if (remove) { - if (!other_dentry) { - list_for_each_entry (tmp, &inode->dentry_list, - inode_list) { - if (local->loc.parent != tmp->parent) { - other_dentry = tmp; - break; - } - } - } - - if (!other_dentry) - mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL); - else { - loc.parent = inode_ref (other_dentry->parent); - loc.name = gf_strdup (other_dentry->name); - uuid_copy (loc.pargfid , other_dentry->parent->gfid); - loc.inode = inode_ref (inode); - uuid_copy (loc.gfid, inode->gfid); - inode_path (other_dentry->parent, other_dentry->name, - (char **)&loc.path); - - STACK_WIND (frame, mq_removexattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - &loc, contri_key, NULL); - } - } else - mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL); + if (!loc_is_root(loc) && !quota_meta_is_null(&delta)) + mq_initiate_quota_txn(this, loc, NULL); - ret = 0; + ret = 0; + goto out; - if (strcmp (local->parent_loc.path, "/") != 0) { - ret = mq_get_parent_inode_local (this, local); - if (ret < 0) - goto out; +create_xattr: + if (ret < 0) + ret = mq_create_xattrs_txn(this, loc, NULL); - mq_start_quota_txn (this, &local->loc, local->ctx, local->contri); - } out: - mq_local_unref (this, local); - - loc_wipe (&loc); - inode_unref (inode); - return 0; + return ret; } int32_t -mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +mq_inspect_file_xattr(xlator_t *this, quota_inode_ctx_t *ctx, + inode_contribution_t *contribution, loc_t *loc, + dict_t *dict, struct iatt *buf) { - int32_t ret = -1; - struct gf_flock lock = {0, }; - 
quota_inode_ctx_t *ctx = NULL; - quota_local_t *local = NULL; - int64_t contribution = 0; - - local = frame->local; - if (op_ret == -1) - local->err = -1; - - ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx); - - LOCK (&local->contri->lock); + int32_t ret = -1; + quota_meta_t size = { + 0, + }; + quota_meta_t contri = { + 0, + }; + quota_meta_t delta = { + 0, + }; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + int keylen = 0; + gf_boolean_t status = _gf_false; + + if (!buf || !contribution || !ctx) + goto out; + + LOCK(&ctx->lock); + { + ctx->size = 512 * buf->ia_blocks; + ctx->file_count = 1; + ctx->dir_count = 0; + + size.size = ctx->size; + size.file_count = ctx->file_count; + size.dir_count = ctx->dir_count; + } + UNLOCK(&ctx->lock); + + GET_CONTRI_KEY(this, contri_key, contribution->gfid, keylen); + if (keylen < 0) { + ret = -1; + goto out; + } + + ret = _quota_dict_get_meta(this, dict, contri_key, keylen, &contri, + IA_IFREG, _gf_true); + if (ret < 0) { + ret = mq_create_xattrs_txn(this, loc, NULL); + } else { + LOCK(&contribution->lock); { - contribution = local->contri->contribution; - } - UNLOCK (&local->contri->lock); - - if (contribution == local->size) { - if (ret == 0) { - LOCK (&ctx->lock); - { - ctx->size -= contribution; - } - UNLOCK (&ctx->lock); - - LOCK (&local->contri->lock); - { - local->contri->contribution = 0; - } - UNLOCK (&local->contri->lock); - } - } - - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; - - STACK_WIND (frame, - _mq_inode_remove_done, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->parent_loc, - F_SETLKW, &lock, NULL); - return 0; -} - -int32_t -mq_reduce_parent_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - int32_t ret = -1; - int64_t *size = NULL; - dict_t *dict = NULL; - quota_local_t *local = NULL; - - local = frame->local; - if (op_ret == -1) { - 
gf_log (this->name, GF_LOG_WARNING, - "inodelk set failed on %s", local->parent_loc.path); - QUOTA_STACK_DESTROY (frame, this); - return 0; + contribution->contribution = contri.size; + contribution->file_count = contri.file_count; + contribution->dir_count = contri.dir_count; } + UNLOCK(&contribution->lock); - VALIDATE_OR_GOTO (local->contri, err); - - dict = dict_new (); - if (dict == NULL) { - ret = -1; - goto err; + ret = mq_get_ctx_updation_status(ctx, &status); + if (ret < 0 || status == _gf_true) { + /* If the update txn is in progress abort inspection */ + ret = 0; + goto out; } - QUOTA_ALLOC_OR_GOTO (size, int64_t, ret, err); - - *size = hton64 (-local->size); + mq_compute_delta(&delta, &size, &contri); + if (!quota_meta_is_null(&delta)) + mq_initiate_quota_txn(this, loc, NULL); + } + /* TODO: revist this code when fixing hardlinks */ - ret = dict_set_bin (dict, QUOTA_SIZE_KEY, size, 8); - if (ret < 0) - goto err; - - uuid_copy (local->parent_loc.gfid, - local->parent_loc.inode->gfid); - GF_UUID_ASSERT (local->parent_loc.gfid); - - STACK_WIND (frame, mq_inode_remove_done, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, &local->parent_loc, - GF_XATTROP_ADD_ARRAY64, dict, NULL); - dict_unref (dict); - return 0; - -err: - local->err = 1; - mq_inode_remove_done (frame, NULL, this, -1, 0, NULL, NULL); - if (dict) - dict_unref (dict); - return 0; +out: + return ret; } int32_t -mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri) +mq_xattr_state(xlator_t *this, loc_t *origin_loc, dict_t *dict, + struct iatt *buf) { - int32_t ret = -1; - struct gf_flock lock = {0,}; - call_frame_t *frame = NULL; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; - - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", loc, out); - - ret = mq_inode_ctx_get (loc->inode, this, &ctx); - if (ret < 0) - goto out; - - contribution = mq_get_contribution_node (loc->parent, ctx); + int32_t 
ret = -1; + quota_inode_ctx_t *ctx = NULL; + loc_t loc = { + 0, + }; + inode_contribution_t *contribution = NULL; + + ret = mq_prevalidate_txn(this, origin_loc, &loc, &ctx, buf); + if (ret < 0 || loc.parent == NULL) + goto out; + + if (!loc_is_root(&loc)) { + contribution = mq_add_new_contribution_node(this, ctx, &loc); if (contribution == NULL) { - gf_log_callingfn (this->name, GF_LOG_WARNING, "contribution for" - " the node %s is NULL", loc->path); - goto out; - } - - local = mq_local_new (); - if (local == NULL) { - ret = -1; - goto out; - } - - if (contri >= 0) { - local->size = contri; - } else { - LOCK (&contribution->lock); - { - local->size = contribution->contribution; - } - UNLOCK (&contribution->lock); - } - - if (local->size == 0) { - gf_log_callingfn (this->name, GF_LOG_TRACE, - "local->size is 0 " "path: (%s)", loc->path); - ret = 0; - goto out; - } - - ret = mq_loc_copy (&local->loc, loc); - if (ret < 0) - goto out; - - local->ctx = ctx; - local->contri = contribution; - - ret = mq_inode_loc_fill (NULL, loc->parent, &local->parent_loc); - if (ret < 0) { - gf_log_callingfn (this->name, GF_LOG_INFO, "building parent loc" - " failed. 
(gfid: %s)", - uuid_utoa (loc->parent->gfid)); - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - mq_assign_lk_owner (this, frame); - - frame->local = local; - - lock.l_len = 0; - lock.l_start = 0; - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; - - if (local->parent_loc.inode == NULL) { - ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "Inode is NULL, so can't stackwind."); - goto out; - } - - STACK_WIND (frame, - mq_reduce_parent_size_xattr, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->parent_loc, F_SETLKW, &lock, NULL); - local = NULL; - ret = 0; + if (!gf_uuid_is_null(loc.inode->gfid)) + gf_log(this->name, GF_LOG_WARNING, + "cannot add a new contribution node " + "(%s)", + uuid_utoa(loc.gfid)); + ret = -1; + goto out; + } + if (buf->ia_type == IA_IFDIR) + mq_inspect_directory_xattr(this, ctx, contribution, &loc, dict); + else + mq_inspect_file_xattr(this, ctx, contribution, &loc, dict, buf); + } else { + mq_inspect_directory_xattr(this, ctx, 0, &loc, dict); + } out: - if (local != NULL) - mq_local_unref (this, local); - - return ret; -} + loc_wipe(&loc); + if (contribution) + GF_REF_PUT(contribution); -int32_t -init_quota_priv (xlator_t *this) -{ - return 0; + return ret; } - int32_t -mq_rename_update_newpath (xlator_t *this, loc_t *loc) +mq_req_xattr(xlator_t *this, loc_t *loc, dict_t *dict, char *contri_key, + char *size_key) { - int32_t ret = -1; - quota_inode_ctx_t *ctx = NULL; - inode_contribution_t *contribution = NULL; + int32_t ret = -1; + char key[QUOTA_KEY_MAX] = { + 0, + }; - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", loc, out); - GF_VALIDATE_OR_GOTO ("marker", loc->inode, out); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", loc, out); + GF_VALIDATE_OR_GOTO("marker", dict, out); - ret = mq_inode_ctx_get (loc->inode, this, &ctx); + if (!loc_is_root(loc)) { + ret = 
mq_dict_set_contribution(this, dict, loc, NULL, contri_key); if (ret < 0) - goto out; + goto out; + } - contribution = mq_add_new_contribution_node (this, ctx, loc); - if (contribution == NULL) { - ret = -1; - goto out; + GET_SIZE_KEY(this, key, ret); + if (ret < 0) + goto out; + if (size_key) + if (snprintf(size_key, QUOTA_KEY_MAX, "%s", key) >= QUOTA_KEY_MAX) { + ret = -1; + goto out; } - mq_initiate_quota_txn (this, loc); + ret = dict_set_uint64(dict, key, 0); + if (ret < 0) + goto out; + + ret = dict_set_int8(dict, QUOTA_DIRTY_KEY, 0); + out: - return ret; + if (ret < 0) + gf_log_callingfn(this ? this->name : "Marker", GF_LOG_ERROR, + "dict set failed"); + return ret; } int32_t -mq_forget (xlator_t *this, quota_inode_ctx_t *ctx) +mq_forget(xlator_t *this, quota_inode_ctx_t *ctx) { - inode_contribution_t *contri = NULL; - inode_contribution_t *next = NULL; + inode_contribution_t *contri = NULL; + inode_contribution_t *next = NULL; - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO ("marker", ctx, out); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", ctx, out); - list_for_each_entry_safe (contri, next, &ctx->contribution_head, - contri_list) { - list_del (&contri->contri_list); - GF_FREE (contri); - } + list_for_each_entry_safe(contri, next, &ctx->contribution_head, contri_list) + { + list_del_init(&contri->contri_list); + GF_REF_PUT(contri); + } - LOCK_DESTROY (&ctx->lock); - GF_FREE (ctx); + LOCK_DESTROY(&ctx->lock); + GF_FREE(ctx); out: - return 0; + return 0; } diff --git a/xlators/features/marker/src/marker-quota.h b/xlators/features/marker/src/marker-quota.h index 42def9d22dc..4bbf6878b22 100644 --- a/xlators/features/marker/src/marker-quota.h +++ b/xlators/features/marker/src/marker-quota.h @@ -10,126 +10,131 @@ #ifndef _MARKER_QUOTA_H #define _MARKER_QUOTA_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" +#include <glusterfs/xlator.h> #include "marker-mem-types.h" 
+#include <glusterfs/refcount.h> +#include <glusterfs/quota-common-utils.h> +#include <glusterfs/call-stub.h> #define QUOTA_XATTR_PREFIX "trusted.glusterfs" #define QUOTA_DIRTY_KEY "trusted.glusterfs.quota.dirty" #define CONTRIBUTION "contri" -#define CONTRI_KEY_MAX 512 +#define QUOTA_KEY_MAX 512 #define READDIR_BUF 4096 - -#define QUOTA_STACK_DESTROY(_frame, _this) \ - do { \ - quota_local_t *_local = NULL; \ - _local = _frame->local; \ - _frame->local = NULL; \ - STACK_DESTROY (_frame->root); \ - mq_local_unref (_this, _local); \ - } while (0) - - -#define QUOTA_ALLOC(var, type, ret) \ - do { \ - ret = 0; \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_marker_mt_##type); \ - if (!var) { \ - ret = -1; \ - } \ - } while (0); - -#define QUOTA_ALLOC_OR_GOTO(var, type, ret, label) \ - do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_marker_mt_##type); \ - if (!var) { \ - gf_log ("", GF_LOG_ERROR, \ - "out of memory"); \ - ret = -1; \ - goto label; \ - } \ - ret = 0; \ - } while (0); - -#define GET_CONTRI_KEY(var, _gfid, _ret) \ - do { \ - if (_gfid != NULL) { \ - char _gfid_unparsed[40]; \ - uuid_unparse (_gfid, _gfid_unparsed); \ - _ret = snprintf (var, CONTRI_KEY_MAX, \ - QUOTA_XATTR_PREFIX \ - ".%s.%s." CONTRIBUTION, "quota", \ - _gfid_unparsed); \ - } else { \ - _ret = snprintf (var, CONTRI_KEY_MAX, \ - QUOTA_XATTR_PREFIX \ - ".%s.." 
CONTRIBUTION, "quota"); \ - } \ - } while (0); - -#define QUOTA_SAFE_INCREMENT(lock, var) \ - do { \ - LOCK (lock); \ - var ++; \ - UNLOCK (lock); \ - } while (0) +#define QUOTA_ALLOC(var, type, ret) \ + do { \ + ret = 0; \ + var = GF_CALLOC(sizeof(type), 1, gf_marker_mt_##type); \ + if (!var) { \ + ret = -1; \ + } \ + } while (0); + +#define QUOTA_ALLOC_OR_GOTO(var, type, ret, label) \ + do { \ + var = GF_CALLOC(sizeof(type), 1, gf_marker_mt_##type); \ + if (!var) { \ + gf_log("", GF_LOG_ERROR, "out of memory"); \ + ret = -1; \ + goto label; \ + } \ + ret = 0; \ + } while (0); + +#define GET_QUOTA_KEY(_this, var, key, _ret) \ + do { \ + marker_conf_t *_priv = _this->private; \ + if (_priv->version > 0) \ + _ret = snprintf(var, QUOTA_KEY_MAX, "%s.%d", key, _priv->version); \ + else \ + _ret = snprintf(var, QUOTA_KEY_MAX, "%s", key); \ + } while (0) + +#define GET_CONTRI_KEY(_this, var, _gfid, _ret) \ + do { \ + char _tmp_var[QUOTA_KEY_MAX] = { \ + 0, \ + }; \ + if (_gfid != NULL) { \ + char _gfid_unparsed[40]; \ + gf_uuid_unparse(_gfid, _gfid_unparsed); \ + _ret = snprintf(_tmp_var, QUOTA_KEY_MAX, \ + QUOTA_XATTR_PREFIX ".%s.%s." CONTRIBUTION, \ + "quota", _gfid_unparsed); \ + } else { \ + _ret = snprintf(_tmp_var, QUOTA_KEY_MAX, \ + QUOTA_XATTR_PREFIX ".%s.." 
CONTRIBUTION, "quota"); \ + } \ + GET_QUOTA_KEY(_this, var, _tmp_var, _ret); \ + } while (0) + +#define GET_SIZE_KEY(_this, var, _ret) \ + { \ + GET_QUOTA_KEY(_this, var, QUOTA_SIZE_KEY, _ret); \ + } + +#define QUOTA_SAFE_INCREMENT(lock, var) \ + do { \ + LOCK(lock); \ + var++; \ + UNLOCK(lock); \ + } while (0) struct quota_inode_ctx { - int64_t size; - int8_t dirty; - gf_boolean_t updation_status; - gf_lock_t lock; - struct list_head contribution_head; + int64_t size; + int64_t file_count; + int64_t dir_count; + int8_t dirty; + gf_boolean_t create_status; + gf_boolean_t updation_status; + gf_boolean_t dirty_status; + gf_lock_t lock; + struct list_head contribution_head; }; typedef struct quota_inode_ctx quota_inode_ctx_t; +struct quota_synctask { + xlator_t *this; + loc_t loc; + quota_meta_t contri; + gf_boolean_t is_static; + uint32_t ia_nlink; + call_stub_t *stub; +}; +typedef struct quota_synctask quota_synctask_t; + struct inode_contribution { - struct list_head contri_list; - int64_t contribution; - uuid_t gfid; - gf_lock_t lock; + struct list_head contri_list; + int64_t contribution; + int64_t file_count; + int64_t dir_count; + uuid_t gfid; + gf_lock_t lock; + GF_REF_DECL; }; typedef struct inode_contribution inode_contribution_t; int32_t -mq_get_lock_on_parent (call_frame_t *, xlator_t *); - -int32_t -mq_req_xattr (xlator_t *, loc_t *, dict_t *); - -int32_t -init_quota_priv (xlator_t *); +mq_req_xattr(xlator_t *, loc_t *, dict_t *, char *, char *); int32_t -mq_xattr_state (xlator_t *, loc_t *, dict_t *, struct iatt); - -int32_t -mq_set_inode_xattr (xlator_t *, loc_t *); +mq_xattr_state(xlator_t *, loc_t *, dict_t *, struct iatt *); int -mq_initiate_quota_txn (xlator_t *, loc_t *); - -int32_t -mq_dirty_inode_readdir (call_frame_t *, void *, xlator_t *, - int32_t, int32_t, fd_t *, dict_t *); +mq_initiate_quota_txn(xlator_t *, loc_t *, struct iatt *); -int32_t -mq_reduce_parent_size (xlator_t *, loc_t *, int64_t); +int +mq_initiate_quota_blocking_txn(xlator_t 
*, loc_t *, struct iatt *); -int32_t -mq_rename_update_newpath (xlator_t *, loc_t *); +int +mq_create_xattrs_txn(xlator_t *this, loc_t *loc, struct iatt *buf); int32_t -mq_inspect_file_xattr (xlator_t *this, loc_t *loc, dict_t *dict, struct iatt buf); +mq_reduce_parent_size_txn(xlator_t *, loc_t *, quota_meta_t *, uint32_t nlink, + call_stub_t *stub); int32_t -mq_forget (xlator_t *, quota_inode_ctx_t *); +mq_forget(xlator_t *, quota_inode_ctx_t *); #endif diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c index e448bc08f67..1375ccc498c 100644 --- a/xlators/features/marker/src/marker.c +++ b/xlators/features/marker/src/marker.c @@ -7,2031 +7,2315 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "libxlator.h" #include "marker.h" #include "marker-mem-types.h" #include "marker-quota.h" #include "marker-quota-helper.h" #include "marker-common.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/syncop.h> +#include <glusterfs/syscall.h> + +#include <fnmatch.h> #define _GF_UID_GID_CHANGED 1 +static char *mq_ext_xattrs[] = { + QUOTA_SIZE_KEY, + QUOTA_LIMIT_KEY, + QUOTA_LIMIT_OBJECTS_KEY, + NULL, +}; + void -fini (xlator_t *this); +fini(xlator_t *this); int32_t -marker_start_setxattr (call_frame_t *, xlator_t *); +marker_start_setxattr(call_frame_t *, xlator_t *); -marker_local_t * -marker_local_ref (marker_local_t *local) +/* When client/quotad request for quota xattrs, + * replace the key-name by adding the version number + * in end of the key-name. + * In the cbk, result value of xattrs for original + * key-name. 
+ * Below function marker_key_replace_with_ver and + * marker_key_set_ver is used for setting/removing + * version for the key-name + */ +int +marker_key_replace_with_ver(xlator_t *this, dict_t *dict) { - GF_VALIDATE_OR_GOTO ("marker", local, err); + int ret = -1; + int i = 0; + marker_conf_t *priv = NULL; + char key[QUOTA_KEY_MAX] = { + 0, + }; + + priv = this->private; - LOCK (&local->lock); - { - local->ref++; + if (dict == NULL || priv->version <= 0) { + ret = 0; + goto out; + } + + for (i = 0; mq_ext_xattrs[i]; i++) { + if (dict_get(dict, mq_ext_xattrs[i])) { + GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret); + if (ret < 0) + goto out; + + ret = dict_set(dict, key, dict_get(dict, mq_ext_xattrs[i])); + if (ret < 0) + goto out; + + dict_del(dict, mq_ext_xattrs[i]); } - UNLOCK (&local->lock); + } + + ret = 0; + +out: + return ret; +} + +int +marker_key_set_ver(xlator_t *this, dict_t *dict) +{ + int ret = -1; + int i = -1; + marker_conf_t *priv = NULL; + char key[QUOTA_KEY_MAX] = { + 0, + }; - return local; + priv = this->private; + + if (dict == NULL || priv->version <= 0) { + ret = 0; + goto out; + } + + for (i = 0; mq_ext_xattrs[i]; i++) { + GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret); + if (ret < 0) + goto out; + + if (dict_get(dict, key)) + dict_set(dict, mq_ext_xattrs[i], dict_get(dict, key)); + } + + ret = 0; +out: + return ret; +} + +marker_local_t * +marker_local_ref(marker_local_t *local) +{ + GF_VALIDATE_OR_GOTO("marker", local, err); + + LOCK(&local->lock); + { + local->ref++; + } + UNLOCK(&local->lock); + + return local; err: - return NULL; + return NULL; } int -marker_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path) +marker_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path) { - int ret = -1; + int ret = -1; - if (!loc) - return ret; + if (!loc) + return ret; - if (inode) { - loc->inode = inode_ref (inode); - if (uuid_is_null (loc->gfid)) { - uuid_copy (loc->gfid, loc->inode->gfid); - } + if (inode) { + loc->inode 
= inode_ref(inode); + if (gf_uuid_is_null(loc->gfid)) { + gf_uuid_copy(loc->gfid, loc->inode->gfid); } + } - if (parent) - loc->parent = inode_ref (parent); - - if (path) { - loc->path = gf_strdup (path); - if (!loc->path) { - gf_log ("loc fill", GF_LOG_ERROR, "strdup failed"); - goto loc_wipe; - } - - loc->name = strrchr (loc->path, '/'); - if (loc->name) - loc->name++; + if (parent) + loc->parent = inode_ref(parent); + + if (path) { + loc->path = gf_strdup(path); + if (!loc->path) { + gf_log("loc fill", GF_LOG_ERROR, "strdup failed"); + goto loc_wipe; } - ret = 0; + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + } + + ret = 0; loc_wipe: - if (ret < 0) - loc_wipe (loc); + if (ret < 0) + loc_wipe(loc); - return ret; + return ret; } int -marker_inode_loc_fill (inode_t *inode, loc_t *loc) +_marker_inode_loc_fill(inode_t *inode, inode_t *parent, char *name, loc_t *loc) { - char *resolvedpath = NULL; - int ret = -1; - inode_t *parent = NULL; + char *resolvedpath = NULL; + int ret = -1; + gf_boolean_t free_parent = _gf_false; - if ((!inode) || (!loc)) - return ret; + if ((!inode) || (!loc)) + return ret; - parent = inode_parent (inode, NULL, NULL); + if (parent && name) + ret = inode_path(parent, name, &resolvedpath); + else + ret = inode_path(inode, NULL, &resolvedpath); + if (ret < 0) + goto err; - ret = inode_path (inode, NULL, &resolvedpath); - if (ret < 0) - goto err; + if (parent == NULL) { + parent = inode_parent(inode, NULL, NULL); + free_parent = _gf_true; + } - ret = marker_loc_fill (loc, inode, parent, resolvedpath); - if (ret < 0) - goto err; + ret = marker_loc_fill(loc, inode, parent, resolvedpath); + if (ret < 0) + goto err; err: - if (parent) - inode_unref (parent); + if (free_parent) + inode_unref(parent); - GF_FREE (resolvedpath); + GF_FREE(resolvedpath); - return ret; + return ret; +} + +int +marker_inode_loc_fill(inode_t *inode, loc_t *loc) +{ + return _marker_inode_loc_fill(inode, NULL, NULL, loc); } int32_t 
-marker_trav_parent (marker_local_t *local) +marker_trav_parent(marker_local_t *local) { - int32_t ret = 0; - loc_t loc = {0, }; - inode_t *parent = NULL; - int8_t need_unref = 0; + int32_t ret = 0; + loc_t loc = { + 0, + }; + inode_t *parent = NULL; + int8_t need_unref = 0; - if (!local->loc.parent) { - parent = inode_parent (local->loc.inode, NULL, NULL); - if (parent) - need_unref = 1; - } else - parent = local->loc.parent; + if (!local->loc.parent) { + parent = inode_parent(local->loc.inode, NULL, NULL); + if (parent) + need_unref = 1; + } else + parent = local->loc.parent; - ret = marker_inode_loc_fill (parent, &loc); + ret = marker_inode_loc_fill(parent, &loc); - if (ret < 0) { - ret = -1; - goto out; - } + if (ret < 0) { + ret = -1; + goto out; + } - loc_wipe (&local->loc); + loc_wipe(&local->loc); - local->loc = loc; + local->loc = loc; out: - if (need_unref) - inode_unref (parent); + if (need_unref) + inode_unref(parent); - return ret; + return ret; } -int32_t -marker_error_handler (xlator_t *this, marker_local_t *local, int32_t op_errno) +void +marker_error_handler(xlator_t *this, marker_local_t *local, int32_t op_errno) { - marker_conf_t *priv = NULL; - const char *path = NULL; - - priv = (marker_conf_t *) this->private; - path = local - ? (local->loc.path - ? local->loc.path : uuid_utoa(local->loc.gfid)) - : "<nul>"; - - gf_log (this->name, GF_LOG_CRITICAL, - "Indexing gone corrupt at %s (reason: %s)." - " Geo-replication slave content needs to be revalidated", - path, strerror (op_errno)); - unlink (priv->timestamp_file); + marker_conf_t *priv = (marker_conf_t *)this->private; + const char *path = local ? ((local->loc.path) ? local->loc.path + : uuid_utoa(local->loc.gfid)) + : "<nul>"; - return 0; + gf_log(this->name, GF_LOG_CRITICAL, + "Indexing gone corrupt at %s (reason: %s)." 
+ " Geo-replication slave content needs to be revalidated", + path, strerror(op_errno)); + sys_unlink(priv->timestamp_file); } int32_t -marker_local_unref (marker_local_t *local) +marker_local_unref(marker_local_t *local) { - int32_t var = 0; - - if (local == NULL) - return -1; + int32_t var = 0; - LOCK (&local->lock); - { - var = --local->ref; - } - UNLOCK (&local->lock); - - if (var != 0) - goto out; - - loc_wipe (&local->loc); - loc_wipe (&local->parent_loc); - if (local->xdata) - dict_unref (local->xdata); + if (local == NULL) + return -1; - if (local->oplocal) { - marker_local_unref (local->oplocal); - local->oplocal = NULL; - } - mem_put (local); + LOCK(&local->lock); + { + var = --local->ref; + } + UNLOCK(&local->lock); + + if (var != 0) + goto out; + + loc_wipe(&local->loc); + loc_wipe(&local->parent_loc); + if (local->xdata) + dict_unref(local->xdata); + + if (local->lk_frame) { + STACK_DESTROY(local->lk_frame->root); + local->lk_frame = NULL; + } + + if (local->oplocal) { + marker_local_unref(local->oplocal); + local->oplocal = NULL; + } + mem_put(local); out: - return 0; + return 0; } int32_t -stat_stampfile (xlator_t *this, marker_conf_t *priv, - struct volume_mark **status) +stat_stampfile(xlator_t *this, marker_conf_t *priv, struct volume_mark **status) { - struct stat buf = {0, }; - struct volume_mark *vol_mark = NULL; + struct stat buf = { + 0, + }; + struct volume_mark *vol_mark = NULL; - vol_mark = GF_CALLOC (sizeof (struct volume_mark), 1, - gf_marker_mt_volume_mark); + vol_mark = GF_CALLOC(sizeof(struct volume_mark), 1, + gf_marker_mt_volume_mark); - vol_mark->major = 1; - vol_mark->minor = 0; + vol_mark->major = 1; + vol_mark->minor = 0; - GF_ASSERT (sizeof (priv->volume_uuid_bin) == 16); - memcpy (vol_mark->uuid, priv->volume_uuid_bin, 16); + GF_ASSERT(sizeof(priv->volume_uuid_bin) == 16); + memcpy(vol_mark->uuid, priv->volume_uuid_bin, 16); - if (stat (priv->timestamp_file, &buf) != -1) { - vol_mark->retval = 0; - vol_mark->sec = htonl 
(buf.st_ctime); - vol_mark->usec = htonl (ST_CTIM_NSEC (&buf)/1000); - } else - vol_mark->retval = 1; + if (sys_stat(priv->timestamp_file, &buf) != -1) { + vol_mark->retval = 0; + vol_mark->sec = htonl(buf.st_mtime); + vol_mark->usec = htonl(ST_MTIM_NSEC(&buf) / 1000); + } else + vol_mark->retval = 1; - *status = vol_mark; + *status = vol_mark; - return 0; + return 0; } int32_t -marker_getxattr_stampfile_cbk (call_frame_t *frame, xlator_t *this, - const char *name, struct volume_mark *vol_mark, - dict_t *xdata) +marker_getxattr_stampfile_cbk(call_frame_t *frame, xlator_t *this, + const char *name, struct volume_mark *vol_mark, + dict_t *xdata) { - int32_t ret = -1; - dict_t *dict = NULL; + int32_t ret = -1; + dict_t *dict = NULL; - if (vol_mark == NULL){ - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); + if (vol_mark == NULL) { + STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL); - goto out; - } + goto out; + } - dict = dict_new (); + dict = dict_new(); - ret = dict_set_bin (dict, (char *)name, vol_mark, - sizeof (struct volume_mark)); - if (ret) - gf_log (this->name, GF_LOG_WARNING, "failed to set key %s", - name); + ret = dict_set_bin(dict, (char *)name, vol_mark, + sizeof(struct volume_mark)); + if (ret) { + GF_FREE(vol_mark); + gf_log(this->name, GF_LOG_WARNING, "failed to set key %s", name); + } - STACK_UNWIND_STRICT (getxattr, frame, 0, 0, dict, xdata); + STACK_UNWIND_STRICT(getxattr, frame, 0, 0, dict, xdata); - dict_unref (dict); + if (dict) + dict_unref(dict); out: - return 0; + return 0; } -int32_t -call_from_special_client (call_frame_t *frame, xlator_t *this, const char *name) +gf_boolean_t +call_from_special_client(call_frame_t *frame, xlator_t *this, const char *name) { - struct volume_mark *vol_mark = NULL; - marker_conf_t *priv = NULL; - gf_boolean_t ret = _gf_true; + struct volume_mark *vol_mark = NULL; + marker_conf_t *priv = NULL; + gf_boolean_t is_true = _gf_true; - priv = (marker_conf_t *)this->private; + priv = 
(marker_conf_t *)this->private; - if (frame->root->pid != GF_CLIENT_PID_GSYNCD || name == NULL || - strcmp (name, MARKER_XATTR_PREFIX "." VOLUME_MARK) != 0) { - ret = _gf_false; - goto out; - } + if (frame->root->pid != GF_CLIENT_PID_GSYNCD || name == NULL || + strcmp(name, MARKER_XATTR_PREFIX "." VOLUME_MARK) != 0) { + is_true = _gf_false; + goto out; + } - stat_stampfile (this, priv, &vol_mark); + stat_stampfile(this, priv, &vol_mark); - marker_getxattr_stampfile_cbk (frame, this, name, vol_mark, NULL); + marker_getxattr_stampfile_cbk(frame, this, name, vol_mark, NULL); out: - return ret; + return is_true; } -int32_t -marker_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +static gf_boolean_t +_is_quota_internal_xattr(dict_t *d, char *k, data_t *v, void *data) { - if (cookie) { - gf_log (this->name, GF_LOG_DEBUG, - "Filtering the quota extended attributes"); + int i = 0; + char **external_xattrs = data; - dict_foreach_fnmatch (dict, "trusted.glusterfs.quota*", - marker_filter_quota_xattr, NULL); - } + for (i = 0; external_xattrs && external_xattrs[i]; i++) { + if (strcmp(k, external_xattrs[i]) == 0) + return _gf_false; + } - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + if (fnmatch("trusted.glusterfs.quota*", k, 0) == 0) + return _gf_true; + + /* It would be nice if posix filters pgfid xattrs. 
But since marker + * also takes up responsibility to clean these up, adding the filtering + * here (Check 'quota_xattr_cleaner') + */ + if (fnmatch(PGFID_XATTR_KEY_PREFIX "*", k, 0) == 0) + return _gf_true; + + return _gf_false; } -int32_t -marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - gf_boolean_t ret = _gf_false; - marker_conf_t *priv = NULL; - unsigned long cookie = 0; - - priv = this->private; - - gf_log (this->name, GF_LOG_DEBUG, "USER:PID = %d", frame->root->pid); - - if (priv && priv->feature_enabled & GF_XTIME) - ret = call_from_special_client (frame, this, name); - - if (ret == _gf_false) { - if (name == NULL) { - /* Signifies that marker translator - * has to filter the quota's xattr's, - * this is to prevent afr from performing - * self healing on marker-quota xattrs' - */ - cookie = 1; - } - STACK_WIND_COOKIE (frame, marker_getxattr_cbk, (void *)cookie, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, - name, xdata); - } +static void +marker_filter_internal_xattrs(xlator_t *this, dict_t *xattrs) +{ + marker_conf_t *priv = NULL; + char **ext = NULL; - return 0; + priv = this->private; + if (priv->feature_enabled & GF_QUOTA) + ext = mq_ext_xattrs; + + dict_foreach_match(xattrs, _is_quota_internal_xattr, ext, + dict_remove_foreach_fn, NULL); } +static void +marker_filter_gsyncd_xattrs(call_frame_t *frame, xlator_t *this, dict_t *xattrs) +{ + marker_conf_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + GF_ASSERT(frame); + + if (xattrs && frame->root->pid != GF_CLIENT_PID_GSYNCD) { + GF_REMOVE_INTERNAL_XATTR(GF_XATTR_XTIME_PATTERN, xattrs); + } + return; +} int32_t -marker_setxattr_done (call_frame_t *frame) +marker_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - marker_local_t *local = NULL; + int32_t ret = -1; + if (op_ret < 0) + goto unwind; - local = (marker_local_t *) frame->local; 
+ ret = marker_key_set_ver(this, dict); + if (ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } - frame->local = NULL; + if (cookie) { + gf_log(this->name, GF_LOG_DEBUG, + "Filtering the quota extended attributes"); - STACK_DESTROY (frame->root); + /* If the getxattr is from a non special client, then do not + copy the quota related xattrs (except the quota limit key + i.e trusted.glusterfs.quota.limit-set which has been set by + glusterd on the directory on which quota limit is set.) for + directories. Let the healing of xattrs happen upon lookup. + NOTE: setting of trusted.glusterfs.quota.limit-set as of now + happens from glusterd. It should be moved to quotad. Also + trusted.glusterfs.quota.limit-set is set on directory which + is permanent till quota is removed on that directory or limit + is changed. So let that xattr be healed by other xlators + properly whenever directory healing is done. + */ + /* + * Except limit-set xattr, rest of the xattrs are maintained + * by quota xlator. Don't expose them to other xlators. 
+ * This filter makes sure quota xattrs are not healed as part of + * metadata self-heal + */ + marker_filter_internal_xattrs(frame->this, dict); + } - marker_local_unref (local); + /* Filter gsyncd xtime xattr for non gsyncd clients */ + marker_filter_gsyncd_xattrs(frame, frame->this, dict); - return 0; +unwind: + MARKER_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } -int -marker_specific_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t +marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - int32_t ret = 0; - int32_t done = 0; - marker_local_t *local = NULL; + gf_boolean_t is_true = _gf_false; + marker_conf_t *priv = NULL; + unsigned long cookie = 0; + marker_local_t *local = NULL; + char key[QUOTA_KEY_MAX] = { + 0, + }; + int32_t ret = -1; + int32_t i = 0; + + priv = this->private; - local = (marker_local_t*) frame->local; + if (name) { + for (i = 0; mq_ext_xattrs[i]; i++) { + if (strcmp(name, mq_ext_xattrs[i])) + continue; - if (op_ret == -1 && op_errno == ENOSPC) { - marker_error_handler (this, local, op_errno); - done = 1; + GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret); + if (ret < 0) goto out; + name = key; + break; } + } + + frame->local = mem_get0(this->local_pool); + local = frame->local; + if (local == NULL) + goto out; + + MARKER_INIT_LOCAL(frame, local); + + if ((loc_copy(&local->loc, loc)) < 0) + goto out; + + gf_log(this->name, GF_LOG_DEBUG, "USER:PID = %d", frame->root->pid); + + if (priv && priv->feature_enabled & GF_XTIME) + is_true = call_from_special_client(frame, this, name); - if (local) { - if (local->loc.path && strcmp (local->loc.path, "/") == 0) { - done = 1; - goto out; - } - if (__is_root_gfid (local->loc.gfid)) { - done = 1; - goto out; - } + if (is_true == _gf_false) { + if (name == NULL) { + /* Signifies that marker translator + * has to filter the quota's xattr's, + * this is to 
prevent afr from performing + * self healing on marker-quota xattrs' + */ + cookie = 1; } + STACK_WIND_COOKIE(frame, marker_getxattr_cbk, (void *)cookie, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + } - ret = marker_trav_parent (local); + return 0; +out: + MARKER_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, "Error occurred " - "while traversing to the parent, stopping marker"); +int32_t +marker_setxattr_done(call_frame_t *frame) +{ + marker_local_t *local = NULL; - done = 1; + local = (marker_local_t *)frame->local; - goto out; + frame->local = NULL; + + STACK_DESTROY(frame->root); + + marker_local_unref(local); + + return 0; +} + +int +marker_specific_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int32_t ret = 0; + int32_t done = 1; + marker_local_t *local = NULL; + + local = (marker_local_t *)frame->local; + + if (op_ret == -1 && op_errno == ENOSPC) { + marker_error_handler(this, local, op_errno); + goto out; + } + + if (local) { + if (local->loc.path && strcmp(local->loc.path, "/") == 0) { + goto out; } + if (__is_root_gfid(local->loc.gfid)) { + goto out; + } + } + + ret = (local) ? 
marker_trav_parent(local) : -1; - marker_start_setxattr (frame, this); + if (ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, + "Error occurred " + "while traversing to the parent, stopping marker"); + goto out; + } + marker_start_setxattr(frame, this); + done = 0; out: - if (done) { - marker_setxattr_done (frame); - } + if (done) { + marker_setxattr_done(frame); + } - return 0; + return 0; } int32_t -marker_start_setxattr (call_frame_t *frame, xlator_t *this) +marker_start_setxattr(call_frame_t *frame, xlator_t *this) { - int32_t ret = -1; - dict_t *dict = NULL; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = -1; + dict_t *dict = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - local = (marker_local_t*) frame->local; + local = (marker_local_t *)frame->local; - if (!local) - goto out; + if (!local) + goto out; - dict = dict_new (); + dict = dict_new(); - if (!dict) - goto out; + if (!dict) + goto out; - if (local->loc.inode && uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); + if (local->loc.inode && gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, local->loc.inode->gfid); - GF_UUID_ASSERT (local->loc.gfid); + GF_UUID_ASSERT(local->loc.gfid); - ret = dict_set_static_bin (dict, priv->marker_xattr, - (void *)local->timebuf, 8); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set marker xattr (%s)", local->loc.path); - goto out; - } + ret = dict_set_static_bin(dict, priv->marker_xattr, (void *)local->timebuf, + 8); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "failed to set marker xattr (%s)", + local->loc.path); + goto out; + } - STACK_WIND (frame, marker_specific_setxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, &local->loc, dict, 0, - NULL); + STACK_WIND(frame, marker_specific_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, &local->loc, dict, 0, NULL); 
- ret = 0; + ret = 0; out: - if (dict) - dict_unref (dict); + if (dict) + dict_unref(dict); - return ret; + return ret; } void -marker_gettimeofday (marker_local_t *local) +marker_gettimeofday(marker_local_t *local) { - struct timeval tv = {0, }; + struct timeval tv = { + 0, + }; - gettimeofday (&tv, NULL); + gettimeofday(&tv, NULL); - local->timebuf [0] = htonl (tv.tv_sec); - local->timebuf [1] = htonl (tv.tv_usec); + local->timebuf[0] = htonl(tv.tv_sec); + local->timebuf[1] = htonl(tv.tv_usec); - return; + return; } int32_t -marker_create_frame (xlator_t *this, marker_local_t *local) +marker_create_frame(xlator_t *this, marker_local_t *local) { - call_frame_t *frame = NULL; + call_frame_t *frame = NULL; - frame = create_frame (this, this->ctx->pool); + frame = create_frame(this, this->ctx->pool); - frame->local = (void *) local; + if (!frame) + return -1; - marker_start_setxattr (frame, this); + frame->local = (void *)local; - return 0; + marker_start_setxattr(frame, this); + + return 0; } int32_t -marker_xtime_update_marks (xlator_t *this, marker_local_t *local) +marker_xtime_update_marks(xlator_t *this, marker_local_t *local) { - marker_conf_t *priv = NULL; + marker_conf_t *priv = NULL; - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO (this->name, local, out); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); - priv = this->private; + priv = this->private; - if ((local->pid == GF_CLIENT_PID_GSYNCD - && !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE)) - || (local->pid == GF_CLIENT_PID_DEFRAG)) - goto out; + if ((local->pid == GF_CLIENT_PID_GSYNCD && + !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE)) || + (local->pid == GF_CLIENT_PID_DEFRAG)) + goto out; - marker_gettimeofday (local); + marker_gettimeofday(local); - marker_local_ref (local); + marker_local_ref(local); - marker_create_frame (this, local); + marker_create_frame(this, local); out: - return 0; + return 0; } - int32_t -marker_mkdir_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "error occurred " - "while Creating a file %s", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "error occurred " + "while creating directory %s", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, - buf, preparent, postparent, xdata); + if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) { + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + } + } - if (op_ret == -1 || local == NULL) - goto out; + STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, buf->ia_gfid); - if (priv->feature_enabled & GF_QUOTA) - mq_set_inode_xattr (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_create_xattrs_txn(this, &local->loc, NULL); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, 
local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int -marker_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +marker_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_mkdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + STACK_WIND(frame, marker_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); - return 0; + return 0; err: - STACK_UNWIND_STRICT (mkdir, frame, -1, ENOMEM, NULL, - NULL, NULL, NULL, NULL); - return 0; -} + MARKER_STACK_UNWIND(mkdir, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + return 0; +} int32_t -marker_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + 
marker_conf_t *priv = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "error occurred " - "while Creating a file %s", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "error occurred " + "while creating file %s", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); + if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) { + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + } + } - if (op_ret == -1 || local == NULL) - goto out; + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, buf->ia_gfid); - if (priv->feature_enabled & GF_QUOTA) - mq_set_inode_xattr (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_create_xattrs_txn(this, &local->loc, buf); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +marker_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - int32_t ret = 0; - marker_local_t 
*local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, - fd, xdata); - return 0; + STACK_WIND(frame, marker_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; err: - STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, - NULL, NULL); + MARKER_STACK_UNWIND(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } - int32_t -marker_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +marker_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "error occurred " - "while write, %s", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "error occurred " + "while write, %s", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + 
STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn(this, &local->loc, postbuf); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset, uint32_t flags, - struct iobref *iobref, dict_t *xdata) +marker_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, - flags, iobref, xdata); - return 0; + STACK_WIND(frame, marker_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; err: - STACK_UNWIND_STRICT (writev, 
frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(writev, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + call_stub_t *stub = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "error occurred " - "rmdir %s", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "error occurred " + "rmdir %s", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, - postparent, xdata); + if (op_ret == -1 || local == NULL) + goto out; - if (op_ret == -1 || local == NULL) - goto out; + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); + + if (priv->feature_enabled & GF_QUOTA) { + /* If a 'rm -rf' is performed by a client, rmdir can be faster + than marker background mq_reduce_parent_size_txn. + In this case, as part of rmdir parent child association + will be removed in the server protocol. + This can lead to mq_reduce_parent_size_txn failures. 
- priv = this->private; + So perform mq_reduce_parent_size_txn in foreground + and unwind to server once txn is complete + */ - if (priv->feature_enabled & GF_QUOTA) - mq_reduce_parent_size (this, &local->loc, -1); + stub = fop_rmdir_cbk_stub(frame, default_rmdir_cbk, op_ret, op_errno, + preparent, postparent, xdata); + mq_reduce_parent_size_txn(this, &local->loc, NULL, 1, stub); + + if (stub) { + marker_local_unref(local); + return 0; + } + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); out: - marker_local_unref (local); + STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent, + xdata); - return 0; + marker_local_unref(local); + + return 0; } int32_t -marker_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) +marker_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_rmdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); - return 0; + STACK_WIND(frame, marker_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + return 0; err: - STACK_UNWIND_STRICT (rmdir, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t 
op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + uint32_t nlink = -1; + GF_UNUSED int32_t ret = 0; + call_stub_t *stub = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "%s occurred in unlink", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, "%s occurred in unlink", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, - postparent, xdata); + if (op_ret == -1 || local == NULL) + goto out; - if (op_ret == -1 || local == NULL) - goto out; + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); - priv = this->private; + if (priv->feature_enabled & GF_QUOTA) { + if (local->skip_txn) + goto out; - if (priv->feature_enabled & GF_QUOTA) { - if (!local->skip_txn) - mq_reduce_parent_size (this, &local->loc, -1); + if (xdata) { + ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, &nlink); + if (ret) { + gf_log(this->name, GF_LOG_TRACE, "dict get failed %s ", + strerror(-ret)); + } } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); -out: - marker_local_unref (local); + /* If a 'rm -rf' is performed by a client, unlink can be faster + than marker background mq_reduce_parent_size_txn. + In this case, as part of unlink parent child association + will be removed in the server protocol. + This can lead to mq_reduce_parent_size_txn failures. 
- return 0; -} + So perform mq_reduce_parent_size_txn in foreground + and unwind to server once txn is complete + */ + stub = fop_unlink_cbk_stub(frame, default_unlink_cbk, op_ret, op_errno, + preparent, postparent, xdata); + mq_reduce_parent_size_txn(this, &local->loc, NULL, nlink, stub); -int32_t -marker_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) -{ - marker_local_t *local = NULL; - - local = frame->local; - if (op_ret < 0) { - goto err; + if (stub) { + marker_local_unref(local); + return 0; } + } - if (local == NULL) { - op_errno = EINVAL; - goto err; - } +out: + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); - local->ia_nlink = buf->ia_nlink; + marker_local_unref(local); - STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, - local->xdata); - return 0; -err: - frame->local = NULL; - STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL, NULL); - marker_local_unref (local); - return 0; + return 0; } - int32_t -marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +marker_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + gf_boolean_t dict_free = _gf_false; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto unlink_wind; + if (priv->feature_enabled == 0) + goto unlink_wind; - local = mem_get0 (this->local_pool); - local->xflag = xflag; - if (xdata) - local->xdata = dict_ref (xdata); - MARKER_INIT_LOCAL (frame, local); + local = mem_get0(this->local_pool); + local->xflag = xflag; + if (xdata) + local->xdata = dict_ref(xdata); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, 
loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; - if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { - local->skip_txn = 1; - goto unlink_wind; - } + if (xdata && dict_get(xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY)) { + local->skip_txn = 1; + goto unlink_wind; + } - if (uuid_is_null (loc->gfid) && loc->inode) - uuid_copy (loc->gfid, loc->inode->gfid); + if (xdata == NULL) { + xdata = dict_new(); + dict_free = _gf_true; + } - STACK_WIND (frame, marker_unlink_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; + ret = dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1); + if (ret < 0) + goto err; unlink_wind: - STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); - return 0; + STACK_WIND(frame, marker_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + goto out; + err: - frame->local = NULL; - STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, NULL, NULL, NULL); - marker_local_unref (local); - return 0; -} + MARKER_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL); +out: + if (dict_free) + dict_unref(xdata); + return 0; +} int32_t -marker_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "linking a file ", strerror (op_errno)); - } + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - local = (marker_local_t *) frame->local; + if (op_ret == -1) { + gf_log(this->name, 
GF_LOG_TRACE, + "%s occurred while " + "linking a file ", + strerror(op_errno)); + } - frame->local = NULL; + local = (marker_local_t *)frame->local; - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); + frame->local = NULL; - if (op_ret == -1 || local == NULL) - goto out; + STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - priv = this->private; + if (op_ret == -1 || local == NULL) + goto out; - if (priv->feature_enabled & GF_QUOTA) { - if (!local->skip_txn) - mq_set_inode_xattr (this, &local->loc); - } + priv = this->private; + if (priv->feature_enabled & GF_QUOTA) { + if (!local->skip_txn) + mq_create_xattrs_txn(this, &local->loc, buf); + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +marker_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, newloc); + ret = loc_copy(&local->loc, newloc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; - if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) - local->skip_txn = 1; + if (xdata && dict_get(xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY)) + local->skip_txn = 1; wind: - STACK_WIND (frame, 
marker_link_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); - return 0; + STACK_WIND(frame, marker_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; err: - STACK_UNWIND_STRICT (link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, - NULL); + MARKER_STACK_UNWIND(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +marker_rename_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - marker_local_t *local = NULL, *oplocal = NULL; - loc_t newloc = {0, }; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL, *oplocal = NULL; + loc_t newloc = { + 0, + }; + marker_conf_t *priv = NULL; - local = frame->local; - oplocal = local->oplocal; + local = frame->local; + oplocal = local->oplocal; - priv = this->private; + priv = this->private; - frame->local = NULL; + frame->local = NULL; - if (op_ret < 0) { - if (local->err == 0) { - local->err = op_errno; - } + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "inodelk (UNLOCK) failed on path:%s (gfid:%s) (%s)", + oplocal->parent_loc.path, + uuid_utoa(oplocal->parent_loc.inode->gfid), strerror(op_errno)); + } - gf_log (this->name, GF_LOG_WARNING, - "inodelk (UNLOCK) failed on path:%s (gfid:%s) (%s)", - local->parent_loc.path, - uuid_utoa (local->parent_loc.inode->gfid), - strerror (op_errno)); - } + if (local->err != 0) + goto err; - if (local->stub != NULL) { - call_resume (local->stub); - local->stub = NULL; - } else if (local->err != 0) { - STACK_UNWIND_STRICT (rename, frame, -1, local->err, NULL, NULL, - NULL, NULL, NULL, NULL); - } - - mq_reduce_parent_size (this, &oplocal->loc, oplocal->contribution); - - if (local->loc.inode != NULL) { - mq_reduce_parent_size (this, &local->loc, local->contribution); - } - - 
newloc.inode = inode_ref (oplocal->loc.inode); - newloc.path = gf_strdup (local->loc.path); - newloc.name = strrchr (newloc.path, '/'); - if (newloc.name) - newloc.name++; - newloc.parent = inode_ref (local->loc.parent); + mq_reduce_parent_size_txn(this, &oplocal->loc, &oplocal->contribution, -1, + NULL); - mq_set_inode_xattr (this, &newloc); - - loc_wipe (&newloc); + if (local->loc.inode != NULL) { + /* If destination file exits before rename, it would have + * been unlinked while renaming a file + */ + mq_reduce_parent_size_txn(this, &local->loc, NULL, local->ia_nlink, + NULL); + } + + newloc.inode = inode_ref(oplocal->loc.inode); + newloc.path = gf_strdup(local->loc.path); + newloc.name = strrchr(newloc.path, '/'); + if (newloc.name) + newloc.name++; + newloc.parent = inode_ref(local->loc.parent); + + mq_create_xattrs_txn(this, &newloc, &local->buf); + + loc_wipe(&newloc); + + if (priv->feature_enabled & GF_XTIME) { + if (!local->loc.inode) + local->loc.inode = inode_ref(oplocal->loc.inode); + // update marks on oldpath + gf_uuid_copy(local->loc.gfid, oplocal->loc.inode->gfid); + marker_xtime_update_marks(this, oplocal); + marker_xtime_update_marks(this, local); + } - if (priv->feature_enabled & GF_XTIME) { - //update marks on oldpath - uuid_copy (local->loc.gfid, oplocal->loc.inode->gfid); - marker_xtime_update_marks (this, oplocal); - marker_xtime_update_marks (this, local); - } +err: + marker_local_unref(local); + marker_local_unref(oplocal); - marker_local_unref (local); - marker_local_unref (oplocal); - return 0; + return 0; } - -int32_t -marker_rename_release_newp_lock (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +void +marker_rename_release_oldp_lock(marker_local_t *local, xlator_t *this) { - marker_local_t *local = NULL, *oplocal = NULL; - struct gf_flock lock = {0, }; - - local = frame->local; - oplocal = local->oplocal; + marker_local_t *oplocal = NULL; + call_frame_t *lk_frame = NULL; + 
struct gf_flock lock = { + 0, + }; - if (op_ret < 0) { - if (local->err == 0) { - local->err = op_errno; - } + oplocal = local->oplocal; + lk_frame = local->lk_frame; - gf_log (this->name, GF_LOG_WARNING, - "inodelk (UNLOCK) failed on %s (gfid:%s) (%s)", - oplocal->parent_loc.path, - uuid_utoa (oplocal->parent_loc.inode->gfid), - strerror (op_errno)); - } + if (lk_frame == NULL) + goto err; - if (local->next_lock_on == NULL) { - marker_rename_done (frame, NULL, this, 0, 0, NULL); - goto out; - } + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; + STACK_WIND(lk_frame, marker_rename_done, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, this->name, + &oplocal->parent_loc, F_SETLKW, &lock, NULL); - STACK_WIND (frame, - marker_rename_done, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &local->parent_loc, F_SETLKW, &lock, NULL); + return; -out: - return 0; +err: + marker_local_unref(local); + marker_local_unref(oplocal); } - int32_t -marker_rename_release_oldp_lock (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +marker_rename_unwind(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - marker_local_t *local = NULL, *oplocal = NULL; - struct gf_flock lock = {0, }; + marker_local_t *local = NULL; + marker_local_t *oplocal = NULL; + quota_inode_ctx_t *ctx = NULL; + inode_contribution_t *contri = NULL; - local = frame->local; - oplocal = local->oplocal; + local = frame->local; + oplocal = local->oplocal; + frame->local = NULL; - if ((op_ret < 0) && (op_errno != ENOATTR)) { - local->err = op_errno; - } + // Reset frame uid and gid if set. + if (cookie == (void *)_GF_UID_GID_CHANGED) + MARKER_RESET_UID_GID(frame, frame->root, local); - //Reset frame uid and gid if set. 
- if (cookie == (void *) _GF_UID_GID_CHANGED) - MARKER_RESET_UID_GID (frame, frame->root, local); - - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - lock.l_pid = 0; - - STACK_WIND (frame, - marker_rename_release_newp_lock, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, &oplocal->parent_loc, F_SETLKW, &lock, NULL); - return 0; -} + if (op_ret < 0) + local->err = op_errno ? op_errno : EINVAL; + if (local->stub != NULL) { + /* Remove contribution node from in-memory even if + * remove-xattr has failed as the rename is already performed + * if local->stub is set, which means rename was successful + */ + (void)mq_inode_ctx_get(oplocal->loc.inode, this, &ctx); + if (ctx) { + contri = mq_get_contribution_node(oplocal->loc.parent, ctx); + if (contri) { + QUOTA_FREE_CONTRIBUTION_NODE(ctx, contri); + GF_REF_PUT(contri); + } + } + + call_resume(local->stub); + local->stub = NULL; + local->err = 0; + } else if (local->err != 0) { + STACK_UNWIND_STRICT(rename, frame, -1, local->err, NULL, NULL, NULL, + NULL, NULL, NULL); + } else { + gf_log(this->name, GF_LOG_CRITICAL, + "continuation stub to unwind the call is absent, hence " + "call will be hung (call-stack id = %" PRIu64 ")", + frame->root->unique); + } + + /* If there are in-progress writes on old-path when during rename + * operation, update txn will update the wrong path if lock + * is released before rename unwind. 
+ * So release lock only after rename unwind + */ + marker_rename_release_oldp_lock(local, this); + + return 0; +} + +int32_t +marker_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_local_t *oplocal = NULL; + call_stub_t *stub = NULL; + int32_t ret = 0; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + loc_t newloc = { + 0, + }; + + local = (marker_local_t *)frame->local; + + if (local != NULL) { + oplocal = local->oplocal; + } -int32_t -marker_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) -{ - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; - marker_local_t *oplocal = NULL; - call_stub_t *stub = NULL; - int32_t ret = 0; - char contri_key [512] = {0, }; - loc_t newloc = {0, }; - - local = (marker_local_t *) frame->local; + priv = this->private; + if (op_ret < 0) { if (local != NULL) { - oplocal = local->oplocal; + local->err = op_errno; } - priv = this->private; - - if (op_ret < 0) { - if (local != NULL) { - local->err = op_errno; - } - - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "renaming a file ", strerror (op_errno)); - } + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "renaming a file ", + strerror(op_errno)); + } - if (priv->feature_enabled & GF_QUOTA) { - if ((op_ret < 0) || (local == NULL)) { - goto quota_err; - } - - stub = fop_rename_cbk_stub (frame, default_rename_cbk, op_ret, - op_errno, buf, preoldparent, - postoldparent, prenewparent, - postnewparent, xdata); - if (stub == NULL) { - local->err = ENOMEM; - goto quota_err; - } - - local->stub = stub; 
- - GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret); - if (ret < 0) { - local->err = ENOMEM; - goto quota_err; - } - - /* Removexattr requires uid and gid to be 0, - * reset them in the callback. - */ - MARKER_SET_UID_GID (frame, local, frame->root); - - newloc.inode = inode_ref (oplocal->loc.inode); - newloc.path = gf_strdup (local->loc.path); - newloc.name = strrchr (newloc.path, '/'); - if (newloc.name) - newloc.name++; - newloc.parent = inode_ref (local->loc.parent); - uuid_copy (newloc.gfid, oplocal->loc.inode->gfid); - - STACK_WIND_COOKIE (frame, marker_rename_release_oldp_lock, - frame->cookie, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - &newloc, contri_key, NULL); - - loc_wipe (&newloc); - } else { - frame->local = NULL; - - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, - preoldparent, postoldparent, prenewparent, - postnewparent, xdata); - - if ((op_ret < 0) || (local == NULL)) { - goto out; - } - - if (priv->feature_enabled & GF_XTIME) { - //update marks on oldpath - uuid_copy (local->loc.gfid, oplocal->loc.inode->gfid); - marker_xtime_update_marks (this, oplocal); - marker_xtime_update_marks (this, local); - } + if (priv->feature_enabled & GF_QUOTA) { + if ((op_ret < 0) || (local == NULL)) { + goto quota_err; } -out: - if (!(priv->feature_enabled & GF_QUOTA)) { - marker_local_unref (local); - marker_local_unref (oplocal); - } - - return 0; - -quota_err: - marker_rename_release_oldp_lock (frame, NULL, this, 0, 0, NULL); - return 0; -} - - -int32_t -marker_do_rename (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) -{ - marker_local_t *local = NULL, *oplocal = NULL; - char contri_key[512] = {0, }; - int32_t ret = 0; - int64_t *contribution = 0; - - local = frame->local; - oplocal = local->oplocal; - - //Reset frame uid and gid if set. 
- if (cookie == (void *) _GF_UID_GID_CHANGED) - MARKER_RESET_UID_GID (frame, frame->root, local); - - if ((op_ret < 0) && (op_errno != ENOATTR)) { - local->err = op_errno; - gf_log (this->name, GF_LOG_WARNING, - "fetching contribution values from %s (gfid:%s) " - "failed (%s)", local->loc.path, - uuid_utoa (local->loc.inode->gfid), - strerror (op_errno)); - goto err; - } + local->ia_nlink = 0; + if (xdata) + ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, + &local->ia_nlink); - if (local->loc.inode != NULL) { - GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret); - if (ret < 0) { - local->err = errno; - goto err; - } - - if (dict_get_bin (dict, contri_key, - (void **) &contribution) == 0) { - local->contribution = ntoh64 (*contribution); - } + local->buf = *buf; + stub = fop_rename_cbk_stub(frame, default_rename_cbk, op_ret, op_errno, + buf, preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + if (stub == NULL) { + local->err = ENOMEM; + goto quota_err; } - STACK_WIND (frame, marker_rename_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, &oplocal->loc, - &local->loc, NULL); - - return 0; - -err: - marker_rename_release_oldp_lock (frame, NULL, this, 0, 0, NULL); - return 0; -} - - -int32_t -marker_get_newpath_contribution (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *dict, dict_t *xdata) -{ - marker_local_t *local = NULL, *oplocal = NULL; - char contri_key[512] = {0, }; - int32_t ret = 0; - int64_t *contribution = 0; - - local = frame->local; - oplocal = local->oplocal; - - //Reset frame uid and gid if set. 
- if (cookie == (void *) _GF_UID_GID_CHANGED) - MARKER_RESET_UID_GID (frame, frame->root, local); - - if ((op_ret < 0) && (op_errno != ENOATTR)) { - local->err = op_errno; - gf_log (this->name, GF_LOG_WARNING, - "fetching contribution values from %s (gfid:%s) " - "failed (%s)", oplocal->loc.path, - uuid_utoa (oplocal->loc.inode->gfid), - strerror (op_errno)); - goto err; - } + local->stub = stub; - GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret); + GET_CONTRI_KEY(this, contri_key, oplocal->loc.parent->gfid, ret); if (ret < 0) { - local->err = errno; - goto err; + local->err = ENOMEM; + goto quota_err; } - if (dict_get_bin (dict, contri_key, (void **) &contribution) == 0) - oplocal->contribution = ntoh64 (*contribution); - - if (local->loc.inode != NULL) { - GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret); - if (ret < 0) { - local->err = errno; - goto err; - } - - /* getxattr requires uid and gid to be 0, - * reset them in the callback. - */ - MARKER_SET_UID_GID (frame, local, frame->root); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, local->loc.inode->gfid); - - GF_UUID_ASSERT (local->loc.gfid); - - STACK_WIND_COOKIE (frame, marker_do_rename, - frame->cookie, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - &local->loc, contri_key, NULL); - } else { - marker_do_rename (frame, NULL, this, 0, 0, NULL, NULL); - } + /* Removexattr requires uid and gid to be 0, + * reset them in the callback. 
+ */ + MARKER_SET_UID_GID(frame, local, frame->root); - return 0; -err: - marker_rename_release_oldp_lock (frame, NULL, this, 0, 0, NULL); - return 0; -} + newloc.inode = inode_ref(oplocal->loc.inode); + newloc.path = gf_strdup(local->loc.path); + newloc.name = strrchr(newloc.path, '/'); + if (newloc.name) + newloc.name++; + newloc.parent = inode_ref(local->loc.parent); + gf_uuid_copy(newloc.gfid, oplocal->loc.inode->gfid); + STACK_WIND_COOKIE( + frame, marker_rename_unwind, frame->cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, &newloc, contri_key, NULL); -int32_t -marker_get_oldpath_contribution (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) -{ - marker_local_t *local = NULL, *oplocal = NULL; - char contri_key[512] = {0, }; - int32_t ret = 0; + loc_wipe(&newloc); + } else { + frame->local = NULL; - local = frame->local; - oplocal = local->oplocal; + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); - if (op_ret < 0) { - local->err = op_errno; - gf_log (this->name, GF_LOG_WARNING, - "cannot hold inodelk on %s (gfid:%s) (%s)", - local->next_lock_on->path, - uuid_utoa (local->next_lock_on->inode->gfid), - strerror (op_errno)); - goto lock_err; + if ((op_ret < 0) || (local == NULL)) { + goto out; } - GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret); - if (ret < 0) { - local->err = errno; - goto quota_err; + if (priv->feature_enabled & GF_XTIME) { + // update marks on oldpath + if (!local->loc.inode) + local->loc.inode = inode_ref(oplocal->loc.inode); + gf_uuid_copy(local->loc.gfid, oplocal->loc.inode->gfid); + marker_xtime_update_marks(this, oplocal); + marker_xtime_update_marks(this, local); } + } - /* getxattr requires uid and gid to be 0, - * reset them in the callback. 
- */ - MARKER_SET_UID_GID (frame, local, frame->root); - - if (uuid_is_null (oplocal->loc.gfid)) - uuid_copy (oplocal->loc.gfid, - oplocal->loc.inode->gfid); - - GF_UUID_ASSERT (oplocal->loc.gfid); +out: + if (!(priv->feature_enabled & GF_QUOTA)) { + marker_local_unref(local); + marker_local_unref(oplocal); + } - STACK_WIND_COOKIE (frame, marker_get_newpath_contribution, - frame->cookie, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - &oplocal->loc, contri_key, NULL); - return 0; + return 0; quota_err: - marker_rename_release_oldp_lock (frame, NULL, this, 0, 0, NULL); - return 0; - -lock_err: - if ((local->next_lock_on == NULL) - || (local->next_lock_on == &local->parent_loc)) { - local->next_lock_on = NULL; - marker_rename_release_oldp_lock (frame, NULL, this, 0, 0, NULL); - } else { - marker_rename_release_newp_lock (frame, NULL, this, 0, 0, NULL); - } + marker_rename_unwind(frame, NULL, this, 0, 0, NULL); + return 0; +} + +int32_t +marker_do_rename(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_local_t *oplocal = NULL; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + int keylen = 0; + quota_meta_t contribution = { + 0, + }; + + local = frame->local; + oplocal = local->oplocal; + + // Reset frame uid and gid if set. + if (cookie == (void *)_GF_UID_GID_CHANGED) + MARKER_RESET_UID_GID(frame, frame->root, local); + + if ((op_ret < 0) && (op_errno != ENOATTR) && (op_errno != ENODATA)) { + local->err = op_errno ? op_errno : EINVAL; + gf_log(this->name, GF_LOG_WARNING, + "fetching contribution values from %s (gfid:%s) " + "failed (%s)", + oplocal->loc.path, uuid_utoa(oplocal->loc.inode->gfid), + strerror(op_errno)); + goto err; + } + + GET_CONTRI_KEY(this, contri_key, oplocal->loc.parent->gfid, keylen); + if (keylen < 0) { + local->err = errno ? 
errno : ENOMEM; + goto err; + } + quota_dict_get_meta(dict, contri_key, keylen, &contribution); + oplocal->contribution = contribution; + + STACK_WIND(frame, marker_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &oplocal->loc, &local->loc, + local->xdata); + + return 0; - return 0; +err: + marker_rename_unwind(frame, NULL, this, 0, 0, NULL); + return 0; } - int32_t -marker_rename_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +marker_get_oldpath_contribution(call_frame_t *lk_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - marker_local_t *local = NULL, *oplocal = NULL; - loc_t *loc = NULL; - struct gf_flock lock = {0, }; + call_frame_t *frame = NULL; + marker_local_t *local = NULL; + marker_local_t *oplocal = NULL; + char contri_key[QUOTA_KEY_MAX] = { + 0, + }; + int32_t ret = 0; - local = frame->local; - oplocal = local->oplocal; + local = lk_frame->local; + oplocal = local->oplocal; + frame = local->frame; - if (op_ret < 0) { - if (local->next_lock_on != &oplocal->parent_loc) { - loc = &oplocal->parent_loc; - } else { - loc = &local->parent_loc; - } - - local->err = op_errno; - gf_log (this->name, GF_LOG_WARNING, - "cannot hold inodelk on %s (gfid:%s) (%s)", - loc->path, uuid_utoa (loc->inode->gfid), - strerror (op_errno)); - goto err; + if (op_ret < 0) { + local->err = op_errno ? 
op_errno : EINVAL; + gf_log(this->name, GF_LOG_WARNING, + "cannot hold inodelk on %s (gfid:%s) (%s)", oplocal->loc.path, + uuid_utoa(oplocal->loc.inode->gfid), strerror(op_errno)); + if (local->lk_frame) { + STACK_DESTROY(local->lk_frame->root); + local->lk_frame = NULL; } + goto err; + } - if (local->next_lock_on != NULL) { - lock.l_len = 0; - lock.l_start = 0; - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; - - STACK_WIND (frame, - marker_get_oldpath_contribution, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, local->next_lock_on, - F_SETLKW, &lock, NULL); - } else { - marker_get_oldpath_contribution (frame, 0, this, 0, 0, NULL); - } + GET_CONTRI_KEY(this, contri_key, oplocal->loc.parent->gfid, ret); + if (ret < 0) { + local->err = errno ? errno : ENOMEM; + goto err; + } - return 0; + /* getxattr requires uid and gid to be 0, + * reset them in the callback. + */ + MARKER_SET_UID_GID(frame, local, frame->root); -err: - marker_rename_done (frame, NULL, this, 0, 0, NULL); - return 0; -} + if (gf_uuid_is_null(oplocal->loc.gfid)) + gf_uuid_copy(oplocal->loc.gfid, oplocal->loc.inode->gfid); + + GF_UUID_ASSERT(oplocal->loc.gfid); + STACK_WIND_COOKIE(frame, marker_do_rename, frame->cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, &oplocal->loc, + contri_key, NULL); + return 0; +err: + marker_rename_unwind(frame, NULL, this, 0, 0, NULL); + return 0; +} + +/* For a marker_rename FOP, following is the algorithm used for Quota + * accounting. The use-case considered is: + * 1. rename (src, dst) + * 2. both src and dst exist + * 3. there are parallel operations on src and dst (lets say through fds + * opened on them before rename was initiated). + * + * PS: We've not thought through whether this algo works in the presence of + * hardlinks to src and/or dst. + * + * Algorithm: + * ========== + * + * 1) set inodelk on src-parent + * As part of rename operation, parent can change for the file. 
+ * We need to remove contribution (both on disk xattr and in-memory one) + * to src-parent (and its ancestors) and add the contribution to dst-parent + * (and its ancestors). While we are doing these operations, contribution of + * the file/directory shouldn't be changing as we want to be sure that + * a) what we subtract from src-parent is exactly what we add to dst-parent + * b) we should subtract from src-parent exactly what we contributed to + * src-parent + * So, We hold a lock on src-parent to block any parallel transactions on + * src-inode (since that's the one which survives rename). + * + * If there are any parallel transactions on dst-inode they keep succeeding + * till the association of dst-inode with dst-parent is broken because of an + * inode_rename after unwind of rename fop from marker. Only after unwind + * (and hence inode_rename), we delete and subtract the contribution of + * dst-inode to dst-parent. That way we are making sure we subtract exactly + * what dst-inode contributed to dst-parent. + * + * 2) lookup contribution to src-parent on src-inode. + * We need to save the contribution info for use at step-8. + * + * 3) wind rename + * Perform rename on disk + * + * 4) remove xattr on src-loc + * After rename, parent can change, so + * need to remove xattrs storing contribution to src-parent. + * + * 5) remove contribution node corresponding to src-parent from the in-memory + * list. + * After rename, contri gfid can change and we have + * also removed xattr from file. + * We need to remove in-memory contribution node to prevent updates to + * src-parent even after a successful rename + * + * 6) unwind rename + * This will ensure that rename is done in the server + * inode table. An inode_rename disassociates src-inode from src-parent and + * associates it with dst-parent. It also disassociates dst-inode from + * dst-parent. 
After inode_rename, inode_parent on src-inode will give + * dst-parent and inode_parent on dst-inode will return NULL (assuming + * dst-inode doesn't have any hardlinks). + * + * 7) release inodelk on src-parent + * Lock on src-parent should be released only after + * rename on disk, remove xattr and rename_unwind (and hence inode_rename) + * operations. If lock is released before inode_rename, a parallel + * transaction on src-inode can still update src-parent (as inode_parent on + * src-inode can still return src-parent). This would make the + * contribution from src-inode to src-parent stored in step-2 stale. + * + * 8) Initiate mq_reduce_parent_size_txn on src-parent to remove contribution + * of src-inode to src-parent. We use the contribution stored in step-2. + * Since we had acquired the lock on src-parent all along step-2 through + * inode_rename, we can be sure that a parallel transaction wouldn't have + * added a delta to src-parent. + * + * 9) Initiate mq_reduce_parent_size_txn on dst-parent if dst-inode exists. + * The size reduced from dst-parent and its ancestors is the + * size stored as contribution to dst-parent in dst-inode. + * If the destination file had existed, rename will unlink the + * destination file as part of its operation. + * We need to reduce the size on the dest parent similarly to + * unlink. Since we are initiating reduce-parent-size transaction after + * inode_rename, we can be sure that a parallel transaction wouldn't add + * delta to dst-parent while we are reducing the contribution of dst-inode + * from its ancestors before rename. + * + * 10) create contribution xattr to dst-parent on src-inode. 
+ */ int32_t -marker_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +marker_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_local_t *oplocal = NULL; - marker_conf_t *priv = NULL; - struct gf_flock lock = {0, }; - loc_t *lock_on = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_local_t *oplocal = NULL; + marker_conf_t *priv = NULL; + struct gf_flock lock = { + 0, + }; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto rename_wind; + if (priv->feature_enabled == 0) + goto rename_wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - oplocal = mem_get0 (this->local_pool); + oplocal = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, oplocal); + MARKER_INIT_LOCAL(frame, oplocal); - frame->local = local; + frame->local = local; - local->oplocal = marker_local_ref (oplocal); + local->oplocal = marker_local_ref(oplocal); - ret = loc_copy (&local->loc, newloc); - if (ret < 0) - goto err; + ret = loc_copy(&local->loc, newloc); + if (ret < 0) + goto err; - ret = loc_copy (&oplocal->loc, oldloc); - if (ret < 0) - goto err; + ret = loc_copy(&oplocal->loc, oldloc); + if (ret < 0) + goto err; - if (!(priv->feature_enabled & GF_QUOTA)) { - goto rename_wind; - } + if (!(priv->feature_enabled & GF_QUOTA)) { + goto rename_wind; + } - ret = mq_inode_loc_fill (NULL, newloc->parent, &local->parent_loc); - if (ret < 0) - goto err; + ret = mq_inode_loc_fill(NULL, newloc->parent, &local->parent_loc); + if (ret < 0) + goto err; - ret = mq_inode_loc_fill (NULL, oldloc->parent, &oplocal->parent_loc); - if (ret < 0) - goto err; + ret = mq_inode_loc_fill(NULL, oldloc->parent, &oplocal->parent_loc); + if (ret < 0) + goto err; - if ((newloc->inode != NULL) && (newloc->parent != 
oldloc->parent) - && (uuid_compare (newloc->parent->gfid, - oldloc->parent->gfid) < 0)) { - lock_on = &local->parent_loc; - local->next_lock_on = &oplocal->parent_loc; - } else { - lock_on = &oplocal->parent_loc; - if ((newloc->inode != NULL) && (newloc->parent - != oldloc->parent)) { - local->next_lock_on = &local->parent_loc; - } - } + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; - lock.l_len = 0; - lock.l_start = 0; - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; + local->xdata = xdata ? dict_ref(xdata) : dict_new(); + ret = dict_set_int32(local->xdata, GF_REQUEST_LINK_COUNT_XDATA, 1); + if (ret < 0) + goto err; - STACK_WIND (frame, - marker_rename_inodelk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - this->name, lock_on, - F_SETLKW, &lock, NULL); + local->frame = frame; + local->lk_frame = create_frame(this, this->ctx->pool); + if (local->lk_frame == NULL) + goto err; - return 0; + local->lk_frame->root->uid = 0; + local->lk_frame->root->gid = 0; + local->lk_frame->local = local; + set_lk_owner_from_ptr(&local->lk_frame->root->lk_owner, + local->lk_frame->root); + + STACK_WIND(local->lk_frame, marker_get_oldpath_contribution, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk, this->name, + &oplocal->parent_loc, F_SETLKW, &lock, NULL); + + return 0; rename_wind: - STACK_WIND (frame, marker_rename_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + STACK_WIND(frame, marker_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); - return 0; + return 0; err: - STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, NULL, - NULL, NULL, NULL, NULL, NULL); + MARKER_STACK_UNWIND(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); + marker_local_unref(oplocal); - return 0; + return 0; } - int32_t -marker_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - 
struct iatt *postbuf, dict_t *xdata) +marker_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "truncating a file ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "truncating a file ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) { + /* DHT Rebalance process, at the end of migration will + * first make the src file as a linkto file and then + * truncate the file. By doing a truncate after making the + * src file as linkto file, the contri which is already + * accounted is left over. + * So, we need to account for the linkto file when a truncate + * happens, thereby updating the contri properly. + * By passing NULL for postbuf, mq_prevalidate does not check + * for linkto file. + * Same happens with ftruncate as well. 
+ */ + if (postbuf && IS_DHT_LINKFILE_MODE(postbuf)) + mq_initiate_quota_txn(this, &local->loc, NULL); + else + mq_initiate_quota_txn(this, &local->loc, postbuf); + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +marker_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); - return 0; + STACK_WIND(frame, marker_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; err: - STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +marker_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { 
- marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "truncating a file ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "truncating a file ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) { + if (postbuf && IS_DHT_LINKFILE_MODE(postbuf)) + mq_initiate_quota_txn(this, &local->loc, NULL); + else + mq_initiate_quota_txn(this, &local->loc, postbuf); + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +marker_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + 
MARKER_INIT_LOCAL(frame, local); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); - return 0; + STACK_WIND(frame, marker_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; err: - STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "creating symlinks ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "creating symlinks ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); + if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) { + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + op_ret = -1; + 
op_errno = ENOMEM; + } + } - if (op_ret == -1 || local == NULL) - goto out; + STACK_UNWIND_STRICT(symlink, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, buf->ia_gfid); - if (priv->feature_enabled & GF_QUOTA) - mq_set_inode_xattr (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) { + mq_create_xattrs_txn(this, &local->loc, buf); + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int -marker_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata) +marker_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_symlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, - xdata); - return 0; + STACK_WIND(frame, marker_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata); + return 0; err: - STACK_UNWIND_STRICT (symlink, 
frame, -1, ENOMEM, NULL, - NULL, NULL, NULL, NULL); - return 0; -} + MARKER_STACK_UNWIND(symlink, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; +} int32_t -marker_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +marker_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "creating symlinks ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred with " + "mknod ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; + priv = this->private; - STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, - buf, preparent, postparent, xdata); + if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) { + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + } + } - if (op_ret == -1 || local == NULL) - goto out; + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, buf->ia_gfid); - if ((priv->feature_enabled & GF_QUOTA) && (S_ISREG (local->mode))) { - 
mq_set_inode_xattr (this, &local->loc); - } + if ((priv->feature_enabled & GF_QUOTA) && (S_ISREG(local->mode))) { + mq_create_xattrs_txn(this, &local->loc, buf); + } - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int -marker_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +marker_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - local->mode = mode; + local->mode = mode; - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_mknod_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, - xdata); - return 0; + STACK_WIND(frame, marker_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; err: - STACK_UNWIND_STRICT (mknod, frame, -1, ENOMEM, NULL, - NULL, NULL, NULL, NULL); - return 0; -} + MARKER_STACK_UNWIND(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + return 0; +} int32_t -marker_fallocate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +marker_fallocate_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "fallocating a file ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "fallocating a file ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn(this, &local->loc, postbuf); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) + off_t offset, size_t len, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + 
MARKER_INIT_LOCAL(frame, local); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_fallocate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, - xdata); - return 0; + STACK_WIND(frame, marker_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; err: - STACK_UNWIND_STRICT (fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred during discard", - strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, "%s occurred during discard", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, - postbuf, xdata); + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn(this, &local->loc, postbuf); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - 
marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) + size_t len, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_discard_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); - return 0; + STACK_WIND(frame, marker_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; err: - STACK_UNWIND_STRICT (discard, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(discard, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } int32_t marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred during zerofill", - strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, "%s occurred during zerofill", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t 
*)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf, - postbuf, xdata); + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_QUOTA) - mq_initiate_quota_txn (this, &local->loc); + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn(this, &local->loc, postbuf); - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - off_t len, dict_t *xdata) + off_t len, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_zerofill_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); - return 0; + STACK_WIND(frame, marker_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; err: - STACK_UNWIND_STRICT (zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); - 
return 0; + return 0; } - /* when a call from the special client is received on * key trusted.glusterfs.volume-mark with value "RESET" * or if the value is 0length, update the change the @@ -2039,923 +2323,1246 @@ err: * timestamp file. */ int32_t -call_from_sp_client_to_reset_tmfile (call_frame_t *frame, - xlator_t *this, - dict_t *dict) +call_from_sp_client_to_reset_tmfile(call_frame_t *frame, xlator_t *this, + dict_t *dict) { - int32_t fd = 0; - int32_t op_ret = 0; - int32_t op_errno = 0; - data_t *data = NULL; - marker_conf_t *priv = NULL; + int32_t fd = 0; + int32_t op_ret = 0; + int32_t op_errno = 0; + data_t *data = NULL; + marker_conf_t *priv = NULL; - if (frame == NULL || this == NULL || dict == NULL) - return -1; + if (frame == NULL || this == NULL || dict == NULL) + return -1; - priv = this->private; + priv = this->private; - data = dict_get (dict, "trusted.glusterfs.volume-mark"); - if (data == NULL) - return -1; + data = dict_get(dict, "trusted.glusterfs.volume-mark"); + if (data == NULL) + return -1; - if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { - op_ret = -1; - op_errno = EPERM; + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + op_ret = -1; + op_errno = EPERM; - goto out; + goto out; + } + + if (data->len == 0 || + (data->len == 5 && memcmp(data->data, "RESET", 5) == 0)) { + fd = open(priv->timestamp_file, O_WRONLY | O_TRUNC); + if (fd != -1) { + /* TODO check whether the O_TRUNC would update the + * timestamps on a zero length file on all machies. + */ + sys_close(fd); } - if (data->len == 0 || (data->len == 5 && - memcmp (data->data, "RESET", 5) == 0)) { - fd = open (priv->timestamp_file, O_WRONLY|O_TRUNC); - if (fd != -1) { - /* TODO check whether the O_TRUNC would update the - * timestamps on a zero length file on all machies. 
- */ - close (fd); - } - - if (fd != -1 || errno == ENOENT) { - op_ret = 0; - op_errno = 0; - } else { - op_ret = -1; - op_errno = errno; - } + if (fd != -1 || errno == ENOENT) { + op_ret = 0; + op_errno = 0; } else { - op_ret = -1; - op_errno = EINVAL; + op_ret = -1; + op_errno = errno; } + } else { + op_ret = -1; + op_errno = EINVAL; + } out: - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL); - return 0; + return 0; } - int32_t -marker_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +marker_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred in " - "setxattr ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred in " + "setxattr ", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } -int32_t -marker_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; - - priv 
= this->private; - - if (priv->feature_enabled == 0) - goto wind; - - ret = call_from_sp_client_to_reset_tmfile (frame, this, dict); - if (ret == 0) - return 0; +int +remove_quota_keys(dict_t *dict, char *k, data_t *v, void *data) +{ + call_frame_t *frame = data; + marker_local_t *local = frame->local; + xlator_t *this = frame->this; + marker_conf_t *priv = NULL; + char ver_str[NAME_MAX] = { + 0, + }; + char *dot = NULL; + int ret = -1; + + priv = this->private; + + /* If quota is enabled immediately after disable. + * quota healing starts creating new xattrs + * before completing the cleanup operation. + * So we should check if the xattr is the new. + * Do not remove xattr if its xattr + * version is same as current version + */ + if ((priv->feature_enabled & GF_QUOTA) && priv->version > 0) { + snprintf(ver_str, sizeof(ver_str), ".%d", priv->version); + dot = strrchr(k, '.'); + if (dot && !strcmp(dot, ver_str)) + return 0; + } + + ret = syncop_removexattr(FIRST_CHILD(this), &local->loc, k, 0, NULL); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "%s: Failed to remove " + "extended attribute: %s", + local->loc.path, k); + return -1; + } + return 0; +} - local = mem_get0 (this->local_pool); +int +quota_xattr_cleaner_cbk(int ret, call_frame_t *frame, void *args) +{ + dict_t *xdata = args; + int op_ret = -1; + int op_errno = 0; - MARKER_INIT_LOCAL (frame, local); + op_ret = (ret < 0) ? 
-1 : 0; + op_errno = -ret; - ret = loc_copy (&local->loc, loc); + MARKER_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return ret; +} - if (ret == -1) - goto err; -wind: - STACK_WIND (frame, marker_setxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); - return 0; -err: - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, NULL); +int +quota_xattr_cleaner(void *args) +{ + struct synctask *task = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + marker_local_t *local = NULL; + dict_t *xdata = NULL; + int ret = -1; + + task = synctask_get(); + if (!task) + goto out; + + frame = task->frame; + this = frame->this; + local = frame->local; + + ret = syncop_listxattr(FIRST_CHILD(this), &local->loc, &xdata, NULL, NULL); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = dict_foreach_fnmatch(xdata, "trusted.glusterfs.quota.*", + remove_quota_keys, frame); + if (ret == -1) { + ret = -errno; + goto out; + } + ret = dict_foreach_fnmatch(xdata, PGFID_XATTR_KEY_PREFIX "*", + remove_quota_keys, frame); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = 0; +out: + if (xdata) + dict_unref(xdata); - return 0; + return ret; } - -int32_t -marker_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +marker_do_xattr_cleanup(call_frame_t *frame, xlator_t *this, dict_t *xdata, + loc_t *loc) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "%s occurred while " - "creating symlinks ", strerror (op_errno)); - } + int ret = -1; + marker_local_t *local = NULL; - local = (marker_local_t *) frame->local; + local = mem_get0(this->local_pool); + if (!local) + goto out; - frame->local = NULL; + MARKER_INIT_LOCAL(frame, local); - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); + loc_copy(&local->loc, loc); + ret = synctask_new(this->ctx->env, 
quota_xattr_cleaner, + quota_xattr_cleaner_cbk, frame, xdata); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to create synctask " + "for cleaning up quota extended attributes"); + goto out; + } - if (op_ret == -1 || local == NULL) - goto out; - - priv = this->private; - - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + ret = 0; out: - marker_local_unref (local); + if (ret) + MARKER_STACK_UNWIND(setxattr, frame, -1, ENOMEM, xdata); - return 0; + return ret; +} + +static gf_boolean_t +marker_xattr_cleanup_cmd(dict_t *dict) +{ + return (dict_get(dict, VIRTUAL_QUOTA_XATTR_CLEANUP_KEY) != NULL); } int32_t -marker_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) +marker_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + int op_errno = ENOMEM; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (marker_xattr_cleanup_cmd(dict)) { + if (frame->root->uid != 0 || frame->root->gid != 0) { + op_errno = EPERM; + ret = -1; + goto err; + } - ret = call_from_sp_client_to_reset_tmfile (frame, this, dict); - if (ret == 0) - return 0; + /* The following function does the cleanup and then unwinds the + * corresponding call*/ + loc_path(loc, NULL); + marker_do_xattr_cleanup(frame, this, xdata, loc); + return 0; + } - local = mem_get0 (this->local_pool); + ret = marker_key_replace_with_ver(this, dict); + if (ret < 0) + goto err; - MARKER_INIT_LOCAL (frame, local); + if (priv->feature_enabled == 0) + goto wind; - ret = marker_inode_loc_fill (fd->inode, &local->loc); + ret = call_from_sp_client_to_reset_tmfile(frame, this, dict); + if (ret == 0) + return 0; - if (ret == -1) - goto err; + local = mem_get0(this->local_pool); + + 
MARKER_INIT_LOCAL(frame, local); + + ret = loc_copy(&local->loc, loc); + + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_fsetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); - return 0; + STACK_WIND(frame, marker_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; err: - STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, NULL); + MARKER_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - int32_t -marker_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +marker_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s occurred while " - "creating symlinks ", strerror (op_errno)); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred in " + "fsetxattr", + strerror(op_errno)); + } - local = (marker_local_t *) frame->local; + local = (marker_local_t *)frame->local; - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, statpre, - statpost, xdata); + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } - int32_t -marker_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t 
valid, dict_t *xdata) +marker_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + ret = call_from_sp_client_to_reset_tmfile(frame, this, dict); + if (ret == 0) + return 0; - MARKER_INIT_LOCAL (frame, local); + local = mem_get0(this->local_pool); - ret = marker_inode_loc_fill (fd->inode, &local->loc); + MARKER_INIT_LOCAL(frame, local); - if (ret == -1) - goto err; + ret = marker_inode_loc_fill(fd->inode, &local->loc); + + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_fsetattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid, xdata); - return 0; + STACK_WIND(frame, marker_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; err: - STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(fsetxattr, frame, -1, ENOMEM, NULL); - return 0; + return 0; } - int32_t -marker_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +marker_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - local = (marker_local_t *) frame->local; + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred in " + "fsetattr ", + strerror(op_errno)); + } - frame->local = NULL; + local = (marker_local_t *)frame->local; - if (op_ret == -1) { - gf_log (this->name, ((op_errno == ENOENT) ? 
GF_LOG_DEBUG : - GF_LOG_ERROR), - "%s occurred during setattr of %s", - strerror (op_errno), - (local ? local->loc.path : "<nul>")); - } + frame->local = NULL; - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre, - statpost, xdata); + STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, statpre, statpost, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +marker_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = marker_inode_loc_fill(fd->inode, &local->loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_setattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, xdata); - return 0; + STACK_WIND(frame, marker_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; err: - STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + MARKER_STACK_UNWIND(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t 
-marker_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +marker_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "%s occurred while " - "creating symlinks ", strerror (op_errno)); - } + local = (marker_local_t *)frame->local; - local = (marker_local_t *) frame->local; + frame->local = NULL; - frame->local = NULL; + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, "%s occurred during setattr of %s", + strerror(op_errno), (local ? local->loc.path : "<nul>")); + } - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); - if (op_ret == -1 || local == NULL) - goto out; + if (op_ret == -1 || local == NULL) + goto out; - priv = this->private; + priv = this->private; - if (priv->feature_enabled & GF_XTIME) - marker_xtime_update_marks (this, local); + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +marker_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (priv->feature_enabled == 0) + goto wind; - local = mem_get0 (this->local_pool); + local = 
mem_get0(this->local_pool); - MARKER_INIT_LOCAL (frame, local); + MARKER_INIT_LOCAL(frame, local); - ret = loc_copy (&local->loc, loc); + ret = loc_copy(&local->loc, loc); - if (ret == -1) - goto err; + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_removexattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); - return 0; + STACK_WIND(frame, marker_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; err: - STACK_UNWIND_STRICT (removexattr, frame, -1, ENOMEM, NULL); + MARKER_STACK_UNWIND(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; } - int32_t -marker_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) +marker_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "lookup failed with %s", - strerror (op_errno)); - } - - local = (marker_local_t *) frame->local; - - frame->local = NULL; - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, - dict, postparent); + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, + "%s occurred while " + "removing extended attribute", + strerror(op_errno)); + } - if (op_ret == -1 || local == NULL) - goto out; + local = (marker_local_t *)frame->local; - /* copy the gfid from the stat structure instead of inode, - * since if the lookup is fresh lookup, then the inode - * would have not yet linked to the inode table which happens - * in protocol/server. 
- */ - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, buf->ia_gfid); + frame->local = NULL; + STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata); - priv = this->private; + if (op_ret == -1 || local == NULL) + goto out; - if (priv->feature_enabled & GF_QUOTA) { - mq_xattr_state (this, &local->loc, dict, *buf); - } + priv = this->private; + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks(this, local); out: - marker_local_unref (local); + marker_local_unref(local); - return 0; + return 0; } int32_t -marker_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +marker_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - int32_t ret = 0; - marker_local_t *local = NULL; - marker_conf_t *priv = NULL; + int32_t ret = -1; + int32_t i = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + char key[QUOTA_KEY_MAX] = { + 0, + }; - priv = this->private; + priv = this->private; - if (priv->feature_enabled == 0) - goto wind; + if (name) { + for (i = 0; mq_ext_xattrs[i]; i++) { + if (strcmp(name, mq_ext_xattrs[i])) + continue; - local = mem_get0 (this->local_pool); + GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret); + if (ret < 0) + goto err; + name = key; + break; + } + } - MARKER_INIT_LOCAL (frame, local); + if (priv->feature_enabled == 0) + goto wind; - ret = loc_copy (&local->loc, loc); - if (ret == -1) - goto err; + local = mem_get0(this->local_pool); + + MARKER_INIT_LOCAL(frame, local); + + ret = loc_copy(&local->loc, loc); - if ((priv->feature_enabled & GF_QUOTA) && xattr_req) - mq_req_xattr (this, loc, xattr_req); + if (ret == -1) + goto err; wind: - STACK_WIND (frame, marker_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xattr_req); - return 0; + STACK_WIND(frame, marker_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; err: - STACK_UNWIND_STRICT (lookup, 
frame, -1, 0, NULL, NULL, NULL, NULL); + MARKER_STACK_UNWIND(removexattr, frame, -1, ENOMEM, NULL); - return 0; + return 0; } - -int -marker_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, - dict_t *xdata) +static gf_boolean_t +__has_quota_xattrs(dict_t *xattrs) { - gf_dirent_t *entry = NULL; - loc_t loc = {0, }; - inode_t *parent = NULL; - - if ((op_ret <= 0) || (entries == NULL)) { - goto out; - } - + if (dict_foreach_match(xattrs, _is_quota_internal_xattr, NULL, + dict_null_foreach_fn, NULL) > 0) + return _gf_true; - list_for_each_entry (entry, &entries->list, list) { - if (entry->inode == entry->inode->table->root) { - loc.path = gf_strdup ("/"); - inode_unref (parent); - parent = NULL; - } + return _gf_false; +} - loc.inode = inode_ref (entry->inode); +int32_t +marker_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *dict, struct iatt *postparent) +{ + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + dict_t *xattrs = NULL; + quota_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + + priv = this->private; + local = (marker_local_t *)frame->local; + frame->local = NULL; + + if (op_ret == -1) { + gf_log(this->name, GF_LOG_TRACE, "lookup failed with %s", + strerror(op_errno)); + goto unwind; + } + + ret = marker_key_set_ver(this, dict); + if (ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (dict && __has_quota_xattrs(dict)) { + xattrs = dict_copy_with_ref(dict, NULL); + if (!xattrs) { + op_ret = -1; + op_errno = ENOMEM; + } else { + marker_filter_internal_xattrs(this, xattrs); + } + } else if (dict) { + xattrs = dict_ref(dict); + } - if (parent != NULL) { - loc.parent = inode_ref (parent); - uuid_copy (loc.pargfid, parent->gfid); - } + if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) { + ctx = mq_inode_ctx_new(inode, this); + if (ctx == NULL) { + 
gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + } + } - uuid_copy (loc.gfid, entry->d_stat.ia_gfid); +unwind: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xattrs, + postparent); - mq_xattr_state (this, &loc, entry->dict, entry->d_stat); + if (op_ret == -1 || local == NULL) + goto out; - inode_unref (parent); - parent = inode_ref (entry->inode); - loc_wipe (&loc); - } + /* copy the gfid from the stat structure instead of inode, + * since if the lookup is fresh lookup, then the inode + * would have not yet linked to the inode table which happens + * in protocol/server. + */ + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, buf->ia_gfid); - if (parent) - inode_unref (parent); + if (priv->feature_enabled & GF_QUOTA) { + mq_xattr_state(this, &local->loc, dict, buf); + } out: - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; + marker_local_unref(local); + if (xattrs) + dict_unref(xattrs); + + return 0; } -int -marker_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, - dict_t *xdata) +int32_t +marker_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) { - gf_dirent_t *entry = NULL; - marker_conf_t *priv = NULL; - marker_local_t *local = NULL; - loc_t loc = {0, }; + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; - if (op_ret <= 0) - goto unwind; + priv = this->private; - priv = this->private; - local = frame->local; + xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); + if (!xattr_req) + goto err; - if (!(priv->feature_enabled & GF_QUOTA) || (local == NULL)) { - goto unwind; - } + ret = marker_key_replace_with_ver(this, xattr_req); + if (ret < 0) + goto err; - list_for_each_entry (entry, &entries->list, list) { - if ((strcmp (entry->d_name, ".") == 0) || - (strcmp (entry->d_name, "..") == 0)) - continue; + if (priv->feature_enabled == 0) + goto wind; - loc.inode = inode_ref (entry->inode); - loc.parent = inode_ref (local->loc.inode); + local = mem_get0(this->local_pool); + if (local == NULL) + goto err; - uuid_copy (loc.gfid, entry->d_stat.ia_gfid); - uuid_copy (loc.pargfid, loc.parent->gfid); + MARKER_INIT_LOCAL(frame, local); - mq_xattr_state (this, &loc, entry->dict, entry->d_stat); + ret = loc_copy(&local->loc, loc); + if (ret == -1) + goto err; - loc_wipe (&loc); - } + if ((priv->feature_enabled & GF_QUOTA)) + mq_req_xattr(this, loc, xattr_req, NULL, NULL); -unwind: - local = frame->local; - frame->local = NULL; +wind: + STACK_WIND(frame, marker_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); - marker_local_unref (local); + dict_unref(xattr_req); - return 0; + return 0; +err: + MARKER_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + + if (xattr_req) + dict_unref(xattr_req); + + return 0; } int -marker_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +marker_build_ancestry_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - marker_conf_t *priv = NULL; - loc_t loc = {0, }; - marker_local_t *local = NULL; + gf_dirent_t *entry = NULL; + quota_inode_ctx_t *ctx = NULL; + int ret = -1; - priv = this->private; + if ((op_ret <= 0) || (entries == NULL)) { + goto out; + } - if ((dict != NULL) && dict_get (dict, GET_ANCESTRY_DENTRY_KEY)) { - STACK_WIND 
(frame, marker_build_ancestry_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, - fd, size, offset, dict); - } else { - if (priv->feature_enabled & GF_QUOTA) { - local = mem_get0 (this->local_pool); + list_for_each_entry(entry, &entries->list, list) + { + if (entry->inode == NULL) + continue; - MARKER_INIT_LOCAL (frame, local); + ret = marker_key_set_ver(this, entry->dict); + if (ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + break; + } - loc.parent = local->loc.inode = inode_ref (fd->inode); + ctx = mq_inode_ctx_new(entry->inode, this); + if (ctx == NULL) + gf_log(this->name, GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(entry->inode->gfid)); + } - if (dict == NULL) - dict = dict_new (); +out: + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} - mq_req_xattr (this, &loc, dict); - } +int +marker_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + marker_conf_t *priv = NULL; + marker_local_t *local = NULL; + loc_t loc = { + 0, + }; + int ret = -1; + char *resolvedpath = NULL; + quota_inode_ctx_t *ctx = NULL; + + if (op_ret <= 0) + goto unwind; + + priv = this->private; + local = frame->local; + + if (!(priv->feature_enabled & GF_QUOTA) || (local == NULL)) { + goto unwind; + } + + list_for_each_entry(entry, &entries->list, list) + { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0) || entry->inode == NULL) + continue; + + loc.parent = inode_ref(local->loc.inode); + loc.inode = inode_ref(entry->inode); + ret = inode_path(loc.parent, entry->d_name, &resolvedpath); + if (ret < 0) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get the " + "path for the entry %s", + entry->d_name); + loc_wipe(&loc); + continue; + } + + loc.path = resolvedpath; + resolvedpath = NULL; + + ctx = mq_inode_ctx_new(loc.inode, this); + if (ctx == NULL) + gf_log(this->name, 
GF_LOG_WARNING, + "mq_inode_ctx_new " + "failed for %s", + uuid_utoa(loc.inode->gfid)); - STACK_WIND (frame, marker_readdirp_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, - fd, size, offset, dict); + mq_xattr_state(this, &loc, entry->dict, &entry->d_stat); + loc_wipe(&loc); + + ret = marker_key_set_ver(this, entry->dict); + if (ret < 0) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; } + } - return 0; +unwind: + MARKER_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + + return 0; } +int +marker_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + marker_conf_t *priv = NULL; + loc_t loc = { + 0, + }; + marker_local_t *local = NULL; + int ret = -1; + + priv = this->private; + + dict = dict ? dict_ref(dict) : dict_new(); + if (!dict) + goto unwind; + + ret = marker_key_replace_with_ver(this, dict); + if (ret < 0) + goto unwind; + + if (dict_get(dict, GET_ANCESTRY_DENTRY_KEY)) { + STACK_WIND(frame, marker_build_ancestry_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + } else { + if (priv->feature_enabled & GF_QUOTA) { + local = mem_get0(this->local_pool); + + MARKER_INIT_LOCAL(frame, local); + + loc.parent = local->loc.inode = inode_ref(fd->inode); + + mq_req_xattr(this, &loc, dict, NULL, NULL); + } + + STACK_WIND(frame, marker_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + } + + dict_unref(dict); + return 0; +unwind: + MARKER_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_marker_mt_end + 1); + if (!this) + return ret; - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_marker_mt_end + 1); + if (ret != 0) { 
+ gf_log(this->name, GF_LOG_ERROR, + "Memory accounting init" + " failed"); return ret; -} + } + return ret; +} int32_t -init_xtime_priv (xlator_t *this, dict_t *options) +init_xtime_priv(xlator_t *this, dict_t *options) { - data_t *data = NULL; - int32_t ret = -1; - marker_conf_t *priv = NULL; + int32_t ret = -1; + marker_conf_t *priv = NULL; + char *tmp_opt = NULL; - GF_VALIDATE_OR_GOTO ("marker", this, out); - GF_VALIDATE_OR_GOTO (this->name, options, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); + GF_VALIDATE_OR_GOTO("marker", this, out); + GF_VALIDATE_OR_GOTO(this->name, options, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); - priv = this->private; + priv = this->private; - if((data = dict_get (options, VOLUME_UUID)) != NULL) { - priv->volume_uuid = data->data; + ret = dict_get_str(options, "volume-uuid", &tmp_opt); - ret = uuid_parse (priv->volume_uuid, priv->volume_uuid_bin); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "invalid volume uuid %s", priv->volume_uuid); - goto out; - } + if (ret) { + priv->volume_uuid = NULL; + tmp_opt = ""; - ret = gf_asprintf (& (priv->marker_xattr), "%s.%s.%s", - MARKER_XATTR_PREFIX, priv->volume_uuid, - XTIME); + gf_log(this->name, GF_LOG_ERROR, + "please specify the volume-uuid" + "in the translator options"); - if (ret == -1){ - priv->marker_xattr = NULL; + return -1; + } + gf_asprintf(&priv->volume_uuid, "%s", tmp_opt); - gf_log (this->name, GF_LOG_ERROR, - "Failed to allocate memory"); - goto out; - } + ret = gf_uuid_parse(priv->volume_uuid, priv->volume_uuid_bin); - gf_log (this->name, GF_LOG_DEBUG, - "the volume-uuid = %s", priv->volume_uuid); - } else { - priv->volume_uuid = NULL; + if (ret == -1) { + gf_log(this->name, GF_LOG_ERROR, "invalid volume uuid %s", + priv->volume_uuid); + goto out; + } - gf_log (this->name, GF_LOG_ERROR, - "please specify the volume-uuid" - "in the translator options"); + ret = gf_asprintf(&(priv->marker_xattr), "%s.%s.%s", MARKER_XATTR_PREFIX, + 
priv->volume_uuid, XTIME); - return -1; - } + if (ret == -1) { + priv->marker_xattr = NULL; + goto out; + } - if ((data = dict_get (options, TIMESTAMP_FILE)) != NULL) { - priv->timestamp_file = data->data; + gf_log(this->name, GF_LOG_DEBUG, "volume-uuid = %s", priv->volume_uuid); - gf_log (this->name, GF_LOG_DEBUG, - "the timestamp-file is = %s", - priv->timestamp_file); + ret = dict_get_str(options, "timestamp-file", &tmp_opt); + if (ret) { + priv->timestamp_file = NULL; + tmp_opt = ""; - } else { - priv->timestamp_file = NULL; + gf_log(this->name, GF_LOG_ERROR, + "please specify the timestamp-file" + "in the translator options"); - gf_log (this->name, GF_LOG_ERROR, - "please specify the timestamp-file" - "in the translator options"); + goto out; + } - goto out; - } + ret = gf_asprintf(&priv->timestamp_file, "%s", tmp_opt); + if (ret == -1) { + priv->timestamp_file = NULL; + goto out; + } - ret = 0; + gf_log(this->name, GF_LOG_DEBUG, "the timestamp-file is = %s", + priv->timestamp_file); + + ret = 0; out: - return ret; + return ret; } void -marker_xtime_priv_cleanup (xlator_t *this) +marker_xtime_priv_cleanup(xlator_t *this) { - marker_conf_t *priv = NULL; + marker_conf_t *priv = NULL; - GF_VALIDATE_OR_GOTO ("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", this, out); - priv = (marker_conf_t *) this->private; + priv = (marker_conf_t *)this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); + GF_VALIDATE_OR_GOTO(this->name, priv, out); - GF_FREE (priv->volume_uuid); + GF_FREE(priv->volume_uuid); - GF_FREE (priv->timestamp_file); + GF_FREE(priv->timestamp_file); - GF_FREE (priv->marker_xattr); + GF_FREE(priv->marker_xattr); out: - return; + return; } void -marker_priv_cleanup (xlator_t *this) +marker_priv_cleanup(xlator_t *this) { - marker_conf_t *priv = NULL; + marker_conf_t *priv = NULL; - GF_VALIDATE_OR_GOTO ("marker", this, out); + GF_VALIDATE_OR_GOTO("marker", this, out); - priv = (marker_conf_t *) this->private; + priv = (marker_conf_t 
*)this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); + GF_VALIDATE_OR_GOTO(this->name, priv, out); - marker_xtime_priv_cleanup (this); + marker_xtime_priv_cleanup(this); - LOCK_DESTROY (&priv->lock); + LOCK_DESTROY(&priv->lock); - GF_FREE (priv); -out: - return; -} + GF_FREE(priv); -int32_t -reconfigure (xlator_t *this, dict_t *options) -{ - int32_t ret = 0; - data_t *data = NULL; - gf_boolean_t flag = _gf_false; - marker_conf_t *priv = NULL; - - GF_ASSERT (this); - GF_ASSERT (this->private); - - priv = this->private; - - priv->feature_enabled = 0; - - GF_VALIDATE_OR_GOTO (this->name, options, out); - - data = dict_get (options, "quota"); - if (data) { - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag == _gf_true) { - ret = init_quota_priv (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to initialize quota private"); - } else { - priv->feature_enabled |= GF_QUOTA; - } - } - } + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } - data = dict_get (options, "xtime"); - if (data) { - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag == _gf_true) { - marker_xtime_priv_cleanup (this); - - ret = init_xtime_priv (this, options); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to initialize xtime private, " - "xtime updation will fail"); - } else { - priv->feature_enabled |= GF_XTIME; - data = dict_get (options, "gsync-force-xtime"); - if (!data) - goto out; - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag) - priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; - } - } - } out: - return ret; + return; +} + +int32_t +reconfigure(xlator_t *this, dict_t *options) +{ + int32_t ret = 0; + data_t *data = NULL; + gf_boolean_t flag = _gf_false; + marker_conf_t *priv = NULL; + int32_t version = 0; + + GF_ASSERT(this); + GF_ASSERT(this->private); + + priv = this->private; + + priv->feature_enabled = 0; + + GF_VALIDATE_OR_GOTO(this->name, 
options, out); + + data = dict_get(options, "quota"); + if (data) { + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) + priv->feature_enabled |= GF_QUOTA; + } + + data = dict_get(options, "inode-quota"); + if (data) { + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) + priv->feature_enabled |= GF_INODE_QUOTA; + } + + data = dict_get(options, "quota-version"); + if (data) + ret = gf_string2int32(data->data, &version); + + if (priv->feature_enabled) { + if (version >= 0) + priv->version = version; + else + gf_log(this->name, GF_LOG_ERROR, + "Invalid quota " + "version %d", + priv->version); + } + + data = dict_get(options, "xtime"); + if (data) { + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) { + marker_xtime_priv_cleanup(this); + + ret = init_xtime_priv(this, options); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "failed to initialize xtime private, " + "xtime updation will fail"); + } else { + priv->feature_enabled |= GF_XTIME; + data = dict_get(options, "gsync-force-xtime"); + if (!data) + goto out; + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; + } + } + } +out: + return ret; } - int32_t -init (xlator_t *this) -{ - dict_t *options = NULL; - data_t *data = NULL; - int32_t ret = 0; - gf_boolean_t flag = _gf_false; - marker_conf_t *priv = NULL; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "marker translator needs subvolume defined."); - return -1; - } +init(xlator_t *this) +{ + dict_t *options = NULL; + data_t *data = NULL; + int32_t ret = 0; + gf_boolean_t flag = _gf_false; + marker_conf_t *priv = NULL; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); - return -1; - } + if (!this->children) { + gf_log(this->name, GF_LOG_ERROR, + "marker translator needs subvolume defined."); + return -1; + } - options = this->options; + if 
(!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "Volume is dangling."); + return -1; + } - ALLOCATE_OR_GOTO (this->private, marker_conf_t, err); + options = this->options; - priv = this->private; + ALLOCATE_OR_GOTO(this->private, marker_conf_t, err); - priv->feature_enabled = 0; + priv = this->private; - LOCK_INIT (&priv->lock); + priv->feature_enabled = 0; + priv->version = 0; - data = dict_get (options, "quota"); - if (data) { - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag == _gf_true) { - ret = init_quota_priv (this); - if (ret < 0) - goto err; + LOCK_INIT(&priv->lock); - priv->feature_enabled |= GF_QUOTA; - } - } + data = dict_get(options, "quota"); + if (data) { + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) + priv->feature_enabled |= GF_QUOTA; + } - data = dict_get (options, "xtime"); - if (data) { - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag == _gf_true) { - ret = init_xtime_priv (this, options); - if (ret < 0) - goto err; - - priv->feature_enabled |= GF_XTIME; - data = dict_get (options, "gsync-force-xtime"); - if (!data) - goto cont; - ret = gf_string2boolean (data->data, &flag); - if (ret == 0 && flag) - priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; - } - } + data = dict_get(options, "inode-quota"); + if (data) { + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) + priv->feature_enabled |= GF_INODE_QUOTA; + } + + data = dict_get(options, "quota-version"); + if (data) + ret = gf_string2int32(data->data, &priv->version); - cont: - this->local_pool = mem_pool_new (marker_local_t, 128); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); + if ((ret == 0) && priv->feature_enabled && priv->version < 0) { + gf_log(this->name, GF_LOG_ERROR, "Invalid quota version %d", + priv->version); + goto err; + } + + data = dict_get(options, "xtime"); + if (data) { + ret = 
gf_string2boolean(data->data, &flag); + if (ret == 0 && flag == _gf_true) { + ret = init_xtime_priv(this, options); + if (ret < 0) goto err; - } - return 0; + priv->feature_enabled |= GF_XTIME; + data = dict_get(options, "gsync-force-xtime"); + if (!data) + goto cont; + ret = gf_string2boolean(data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; + } + } + +cont: + this->local_pool = mem_pool_new(marker_local_t, 128); + if (!this->local_pool) { + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + return 0; err: - marker_priv_cleanup (this); + marker_priv_cleanup(this); - return -1; + return -1; } int32_t -marker_forget (xlator_t *this, inode_t *inode) +marker_forget(xlator_t *this, inode_t *inode) { - marker_inode_ctx_t *ctx = NULL; - uint64_t value = 0; + marker_inode_ctx_t *ctx = NULL; + uint64_t value = 0; - if (inode_ctx_del (inode, this, &value) != 0) - goto out; + if (inode_ctx_del(inode, this, &value) != 0) + goto out; - ctx = (marker_inode_ctx_t *)(unsigned long)value; - if (ctx == NULL) { - goto out; - } + ctx = (marker_inode_ctx_t *)(unsigned long)value; + if (ctx == NULL) { + goto out; + } - mq_forget (this, ctx->quota_ctx); + mq_forget(this, ctx->quota_ctx); - GF_FREE (ctx); + GF_FREE(ctx); out: - return 0; + return 0; } void -fini (xlator_t *this) +fini(xlator_t *this) { - marker_priv_cleanup (this); + marker_priv_cleanup(this); } struct xlator_fops fops = { - .lookup = marker_lookup, - .create = marker_create, - .mkdir = marker_mkdir, - .writev = marker_writev, - .truncate = marker_truncate, - .ftruncate = marker_ftruncate, - .symlink = marker_symlink, - .link = marker_link, - .unlink = marker_unlink, - .rmdir = marker_rmdir, - .rename = marker_rename, - .mknod = marker_mknod, - .setxattr = marker_setxattr, - .fsetxattr = marker_fsetxattr, - .setattr = marker_setattr, - .fsetattr = marker_fsetattr, - .removexattr = marker_removexattr, - .getxattr = marker_getxattr, - 
.readdirp = marker_readdirp, - .fallocate = marker_fallocate, - .discard = marker_discard, - .zerofill = marker_zerofill, + .lookup = marker_lookup, + .create = marker_create, + .mkdir = marker_mkdir, + .writev = marker_writev, + .truncate = marker_truncate, + .ftruncate = marker_ftruncate, + .symlink = marker_symlink, + .link = marker_link, + .unlink = marker_unlink, + .rmdir = marker_rmdir, + .rename = marker_rename, + .mknod = marker_mknod, + .setxattr = marker_setxattr, + .fsetxattr = marker_fsetxattr, + .setattr = marker_setattr, + .fsetattr = marker_fsetattr, + .removexattr = marker_removexattr, + .getxattr = marker_getxattr, + .readdirp = marker_readdirp, + .fallocate = marker_fallocate, + .discard = marker_discard, + .zerofill = marker_zerofill, }; -struct xlator_cbks cbks = { - .forget = marker_forget -}; +struct xlator_cbks cbks = {.forget = marker_forget}; struct volume_options options[] = { - {.key = {"volume-uuid"}}, - {.key = {"timestamp-file"}}, - {.key = {"quota"}}, - {.key = {"xtime"}}, - {.key = {"gsync-force-xtime"}}, - {.key = {NULL}} + {.key = {"volume-uuid"}, .default_value = "{{ volume.id }}"}, + {.key = {"timestamp-file"}}, + { + .key = {"quota"}, + .op_version = {1}, + .flags = OPT_FLAG_NONE, + .tags = {}, + }, + { + .key = {"inode-quota"}, + .op_version = {1}, + .flags = OPT_FLAG_NONE, + .tags = {}, + }, + { + .key = {"xtime"}, + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .tags = {}, + }, + { + .key = {"gsync-force-xtime"}, + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .tags = {}, + }, + { + .key = {"quota-version"}, + .flags = OPT_FLAG_NONE, + }, + {.key = {NULL}}}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "marker", + .category = GF_MAINTAINED, }; diff --git 
a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h index 23d1580f0e5..4821094c14b 100644 --- a/xlators/features/marker/src/marker.h +++ b/xlators/features/marker/src/marker.h @@ -10,129 +10,138 @@ #ifndef _MARKER_H #define _MARKER_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "marker-quota.h" -#include "xlator.h" -#include "defaults.h" -#include "uuid.h" -#include "call-stub.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/compat-uuid.h> +#include <glusterfs/call-stub.h> #define MARKER_XATTR_PREFIX "trusted.glusterfs" -#define XTIME "xtime" -#define VOLUME_MARK "volume-mark" -#define VOLUME_UUID "volume-uuid" -#define TIMESTAMP_FILE "timestamp-file" +#define XTIME "xtime" +#define VOLUME_MARK "volume-mark" +#define VOLUME_UUID "volume-uuid" +#define TIMESTAMP_FILE "timestamp-file" enum { - GF_QUOTA = 1, - GF_XTIME = 2, - GF_XTIME_GSYNC_FORCE = 4, + GF_QUOTA = 1, + GF_XTIME = 2, + GF_XTIME_GSYNC_FORCE = 4, + GF_INODE_QUOTA = 8, }; /*initialize the local variable*/ -#define MARKER_INIT_LOCAL(_frame,_local) do { \ - _frame->local = _local; \ - _local->pid = _frame->root->pid; \ - memset (&_local->loc, 0, sizeof (loc_t)); \ - _local->ref = 1; \ - _local->uid = -1; \ - _local->gid = -1; \ - LOCK_INIT (&_local->lock); \ - _local->oplocal = NULL; \ - } while (0) +#define MARKER_INIT_LOCAL(_frame, _local) \ + do { \ + _frame->local = _local; \ + _local->pid = _frame->root->pid; \ + memset(&_local->loc, 0, sizeof(loc_t)); \ + _local->ref = 1; \ + _local->uid = -1; \ + _local->gid = -1; \ + LOCK_INIT(&_local->lock); \ + _local->oplocal = NULL; \ + } while (0) /* try alloc and if it fails, goto label */ -#define ALLOCATE_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_marker_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - goto label; \ - } \ - } while (0) - -#define _MARKER_SET_UID_GID(dest, src) \ - do 
{ \ - if (src->uid != -1 && \ - src->gid != -1) { \ - dest->uid = src->uid; \ - dest->gid = src->gid; \ - } \ - } while (0) - -#define MARKER_SET_UID_GID(frame, dest, src) \ - do { \ - _MARKER_SET_UID_GID (dest, src); \ - frame->root->uid = 0; \ - frame->root->gid = 0; \ - frame->cookie = (void *) _GF_UID_GID_CHANGED; \ - } while (0) - -#define MARKER_RESET_UID_GID(frame, dest, src) \ - do { \ - _MARKER_SET_UID_GID (dest, src); \ - frame->cookie = NULL; \ - } while (0) - -struct marker_local{ - uint32_t timebuf[2]; - pid_t pid; - loc_t loc; - loc_t parent_loc; - loc_t *next_lock_on; - uid_t uid; - gid_t gid; - int32_t ref; - int32_t ia_nlink; - gf_lock_t lock; - mode_t mode; - int32_t err; - call_stub_t *stub; - int64_t contribution; - struct marker_local *oplocal; - - /* marker quota specific */ - int64_t delta; - int64_t d_off; - int64_t sum; - int64_t size; - int32_t hl_count; - int32_t dentry_child_count; - - fd_t *fd; - call_frame_t *frame; - - quota_inode_ctx_t *ctx; - inode_contribution_t *contri; - - int xflag; - dict_t *xdata; - gf_boolean_t skip_txn; +#define ALLOCATE_OR_GOTO(var, type, label) \ + do { \ + var = GF_CALLOC(sizeof(type), 1, gf_marker_mt_##type); \ + if (!var) { \ + gf_log(this->name, GF_LOG_ERROR, "out of memory :("); \ + goto label; \ + } \ + } while (0) + +#define _MARKER_SET_UID_GID(dest, src) \ + do { \ + if (src->uid != -1 && src->gid != -1) { \ + dest->uid = src->uid; \ + dest->gid = src->gid; \ + } \ + } while (0) + +#define MARKER_SET_UID_GID(frame, dest, src) \ + do { \ + _MARKER_SET_UID_GID(dest, src); \ + frame->root->uid = 0; \ + frame->root->gid = 0; \ + frame->cookie = (void *)_GF_UID_GID_CHANGED; \ + } while (0) + +#define MARKER_RESET_UID_GID(frame, dest, src) \ + do { \ + _MARKER_SET_UID_GID(dest, src); \ + frame->cookie = NULL; \ + } while (0) + +#define MARKER_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + quota_local_t *_local = NULL; \ + if (frame) { \ + _local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (_local) \ + marker_local_unref(_local); \ + } while (0) + +struct marker_local { + uint32_t timebuf[2]; + pid_t pid; + loc_t loc; + loc_t parent_loc; + uid_t uid; + gid_t gid; + int32_t ref; + uint32_t ia_nlink; + struct iatt buf; + gf_lock_t lock; + mode_t mode; + int32_t err; + call_stub_t *stub; + call_frame_t *lk_frame; + quota_meta_t contribution; + struct marker_local *oplocal; + + /* marker quota specific */ + int64_t delta; + int64_t d_off; + int64_t sum; + int64_t size; + int32_t hl_count; + int32_t dentry_child_count; + + fd_t *fd; + call_frame_t *frame; + + quota_inode_ctx_t *ctx; + inode_contribution_t *contri; + + int xflag; + dict_t *xdata; + gf_boolean_t skip_txn; }; typedef struct marker_local marker_local_t; #define quota_local_t marker_local_t struct marker_inode_ctx { - struct quota_inode_ctx *quota_ctx; + struct quota_inode_ctx *quota_ctx; }; typedef struct marker_inode_ctx marker_inode_ctx_t; -struct marker_conf{ - char feature_enabled; - char *size_key; - char *dirty_key; - char *volume_uuid; - uuid_t volume_uuid_bin; - char *timestamp_file; - char *marker_xattr; - uint64_t quota_lk_owner; - gf_lock_t lock; +struct marker_conf { + char feature_enabled; + char *size_key; + char *dirty_key; + char *volume_uuid; + uuid_t volume_uuid_bin; + char *timestamp_file; + char *marker_xattr; + uint64_t quota_lk_owner; + gf_lock_t lock; + int32_t version; }; typedef struct marker_conf marker_conf_t; diff --git a/xlators/features/metadisp/Makefile.am b/xlators/features/metadisp/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/metadisp/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/metadisp/src/Makefile.am b/xlators/features/metadisp/src/Makefile.am new file mode 100644 index 
00000000000..1520ad8c424 --- /dev/null +++ b/xlators/features/metadisp/src/Makefile.am @@ -0,0 +1,38 @@ +noinst_PYTHON = gen-fops.py + +EXTRA_DIST = fops-tmpl.c + +xlator_LTLIBRARIES = metadisp.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +nodist_metadisp_la_SOURCES = fops.c + +BUILT_SOURCES = fops.c + +metadisp_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +metadisp_la_SOURCES = metadisp.c \ + metadisp-unlink.c \ + metadisp-stat.c \ + metadisp-lookup.c \ + metadisp-readdir.c \ + metadisp-create.c \ + metadisp-open.c \ + metadisp-fsync.c \ + metadisp-setattr.c \ + backend.c + +metadisp_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = metadisp.h metadisp-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +fops.c: fops-tmpl.c $(top_srcdir)/libglusterfs/src/generator.py gen-fops.py + PYTHONPATH=$(top_srcdir)/libglusterfs/src \ + $(PYTHON) $(srcdir)/gen-fops.py $(srcdir)/fops-tmpl.c > $@ + +CLEANFILES = $(nodist_metadisp_la_SOURCES) diff --git a/xlators/features/metadisp/src/backend.c b/xlators/features/metadisp/src/backend.c new file mode 100644 index 00000000000..ee2c25bfaa7 --- /dev/null +++ b/xlators/features/metadisp/src/backend.c @@ -0,0 +1,45 @@ +#define GFID_STR_LEN 37 + +#include "metadisp.h" + +/* + * backend.c + * + * functions responsible for converting user-facing paths to backend-style + * "/$GFID" paths. 
+ */ + +int32_t +build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc) +{ + static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char gfid_buf[GFID_STR_LEN + 1] = { + 0, + }; + char *path = NULL; + + GF_VALIDATE_OR_GOTO("metadisp", src_loc, out); + GF_VALIDATE_OR_GOTO("metadisp", dst_loc, out); + + loc_copy(dst_loc, src_loc); + memcpy(dst_loc->pargfid, root, sizeof(root)); + GF_FREE((char *)dst_loc->path); // we are overwriting path so nuke + // whatever loc_copy gave us + + uuid_utoa_r(gfid, gfid_buf); + + path = GF_CALLOC(GFID_STR_LEN + 1, sizeof(char), + gf_common_mt_char); // freed via loc_wipe + + path[0] = '/'; + strncpy(path + 1, gfid_buf, GFID_STR_LEN); + path[GFID_STR_LEN] = 0; + dst_loc->path = path; + if (src_loc->name) + dst_loc->name = strrchr(dst_loc->path, '/'); + if (dst_loc->name) + dst_loc->name++; + return 0; +out: + return -1; +} diff --git a/xlators/features/metadisp/src/fops-tmpl.c b/xlators/features/metadisp/src/fops-tmpl.c new file mode 100644 index 00000000000..4385b7dd5b7 --- /dev/null +++ b/xlators/features/metadisp/src/fops-tmpl.c @@ -0,0 +1,10 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <glusterfs/xlator.h> +#include "metadisp.h" +#include "metadisp-fops.h" + +#pragma generate diff --git a/xlators/features/metadisp/src/gen-fops.py b/xlators/features/metadisp/src/gen-fops.py new file mode 100644 index 00000000000..8b5e120fdec --- /dev/null +++ b/xlators/features/metadisp/src/gen-fops.py @@ -0,0 +1,160 @@ +#!/usr/bin/python + +import sys +from generator import fop_subs, generate + +FN_METADATA_CHILD_GENERIC = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ metadata"); + STACK_WIND (frame, default_@NAME@_cbk, + METADATA_CHILD(this), METADATA_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_GENERIC_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) 
+{ + METADISP_TRACE("@NAME@ generic"); + STACK_WIND (frame, default_@NAME@_cbk, + DATA_CHILD(this), DATA_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_DATAFD_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ datafd"); + xlator_t *child = NULL; + child = DATA_CHILD(this); + STACK_WIND (frame, default_@NAME@_cbk, + child, child->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} +""" + +FN_DATALOC_TEMPLATE = """ +int32_t +metadisp_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + METADISP_TRACE("@NAME@ dataloc"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + xlator_t *child = NULL; + child = DATA_CHILD(this); + STACK_WIND (frame, default_@NAME@_cbk, + child, child->fops->@NAME@, + @SHORT_ARGS@); + return 0; + +unwind: + STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); + return 0; +} +""" + +FOPS_LINE_TEMPLATE = "\t.@NAME@ = metadisp_@NAME@," + +skipped = [ + "readdir", + "readdirp", + "lookup", + "fsync", + "stat", + "open", + "create", + "unlink", + "setattr", + # TODO: implement "inodelk", +] + + +def gen_fops(): + done = skipped + + # + # these are fops that wind to the DATA_CHILD + # + # NOTE: re-written in order from google doc: + # https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q + for name in [ + "writev", + "readv", + "ftruncate", + "zerofill", + "discard", + "seek", + "fstat", + ]: + done = done + [name] + print(generate(FN_DATAFD_TEMPLATE, name, fop_subs)) + + for name in ["truncate"]: + done = done + [name] + print(generate(FN_DATALOC_TEMPLATE, name, fop_subs)) + + # these are fops that operate solely on dentries, folders, + # or extended attributes. 
Therefore, they must always + # wind to METADATA_CHILD and should never perform + # any path rewriting + # + # NOTE: re-written in order from google doc: + # https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q + for name in [ + "mkdir", + "symlink", + "link", + "rename", + "mknod", + "opendir", + # "readdir, # special-cased + # "readdirp, # special-cased + "fsyncdir", + # "setattr", # special-cased + "readlink", + "fentrylk", + "access", + # TODO: these wind to both, + # data for backend-attributes and metadata for the rest + "xattrop", + "setxattr", + "getxattr", + "removexattr", + "fgetxattr", + "fsetxattr", + "fremovexattr", + ]: + + done = done + [name] + print(generate(FN_METADATA_CHILD_GENERIC, name, fop_subs)) + + print("struct xlator_fops fops = {") + for name in done: + print(generate(FOPS_LINE_TEMPLATE, name, fop_subs)) + + print("};") + + +for l in open(sys.argv[1], "r").readlines(): + if l.find("#pragma generate") != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_fops() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git a/xlators/features/metadisp/src/metadisp-create.c b/xlators/features/metadisp/src/metadisp-create.c new file mode 100644 index 00000000000..f8c9798dd59 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-create.c @@ -0,0 +1,101 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * Create, like stat, is a two-step process. We send a create + * to the METADATA_CHILD, then send another create to the DATA_CHILD. + * + * We do the metadata child first to ensure that the ACLs are enforced. 
+ */ + +int32_t +metadisp_create_dentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + +int32_t +metadisp_create_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + // create the backend data inode + STACK_WIND(frame, metadisp_create_dentry_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} + +int32_t +metadisp_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = cookie; + if (op_ret != 0) { + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; + } + + if (stub == NULL) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + return 0; + } + + call_resume(stub); + return 0; + +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int32_t +metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + METADISP_TRACE("."); + + loc_t backend_loc = { + 0, + }; + call_stub_t *stub = NULL; + uuid_t *gfid_req = NULL; + + RESOLVE_GFID_REQ(xdata, gfid_req, out); + + if (build_backend_loc(*gfid_req, loc, &backend_loc)) { + goto unwind; + } + + frame->local = loc; + + stub = fop_create_stub(frame, metadisp_create_resume, &backend_loc, flags, + mode, umask, fd, xdata); + + STACK_WIND_COOKIE(frame, metadisp_create_cbk, stub, 
METADATA_CHILD(this), + METADATA_CHILD(this)->fops->create, loc, flags, mode, + umask, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +out: + return -1; +} diff --git a/xlators/features/metadisp/src/metadisp-fops.h b/xlators/features/metadisp/src/metadisp-fops.h new file mode 100644 index 00000000000..56dd427cf34 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-fops.h @@ -0,0 +1,51 @@ +#ifndef GF_METADISP_FOPS_H_ +#define GF_METADISP_FOPS_H_ + +#include <glusterfs/xlator.h> +#include <glusterfs/dict.h> +#include <glusterfs/glusterfs.h> + +#include <sys/types.h> + +/* fops in here are defined in their own file. Every other fop is just defined + * inline of fops.c */ + +int +metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int +metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +int +metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +metadisp_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata); + +int +metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); + +int +metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +#endif diff --git a/xlators/features/metadisp/src/metadisp-fsync.c 
b/xlators/features/metadisp/src/metadisp-fsync.c new file mode 100644 index 00000000000..2e46fa84eac --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-fsync.c @@ -0,0 +1,54 @@ + +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +int32_t +metadisp_fsync_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata) +{ + STACK_WIND(frame, default_fsync_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +} + +int32_t +metadisp_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + call_stub_t *stub = NULL; + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + stub = fop_fsync_stub(frame, metadisp_fsync_resume, fd, flags, xdata); + STACK_WIND_COOKIE(frame, metadisp_fsync_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-lookup.c b/xlators/features/metadisp/src/metadisp-lookup.c new file mode 100644 index 00000000000..27d90c9f746 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-lookup.c @@ -0,0 +1,90 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * Lookup, like stat, is a two-step process for grabbing the metadata details + * as well as the data details. 
+ */ + +int32_t +metadisp_backend_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + METADISP_TRACE("backend_lookup_cbk"); + if (op_errno == ENOENT) { + op_errno = ENODATA; + op_ret = -1; + } + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +int32_t +metadisp_backend_lookup_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + METADISP_TRACE("backend_lookup_resume"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + STACK_WIND(frame, metadisp_backend_lookup_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->lookup, &backend_loc, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = NULL; + stub = cookie; + + if (op_ret != 0) { + goto unwind; + } + + if (!IA_ISREG(buf->ia_type)) { + goto unwind; + } else if (!stub) { + op_errno = EINVAL; + goto unwind; + } + + METADISP_TRACE("resuming stub"); + + // memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t)); + call_resume(stub); + return 0; +unwind: + METADISP_TRACE("unwinding %d %d", op_ret, op_errno); + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + if (stub) { + call_stub_destroy(stub); + } + return 0; +} + +int32_t +metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + METADISP_TRACE("lookup"); + call_stub_t *stub = NULL; + stub = fop_lookup_stub(frame, metadisp_backend_lookup_resume, loc, xdata); + STACK_WIND_COOKIE(frame, metadisp_lookup_cbk, 
stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-open.c b/xlators/features/metadisp/src/metadisp-open.c new file mode 100644 index 00000000000..64814afe636 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-open.c @@ -0,0 +1,70 @@ +#include <glusterfs/call-stub.h> +#include "metadisp.h" + +int32_t +metadisp_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + METADISP_TRACE("got open results %d %d", op_ret, op_errno); + + call_stub_t *stub = NULL; + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (!stub) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int32_t +metadisp_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + STACK_WIND_COOKIE(frame, metadisp_open_cbk, NULL, DATA_CHILD(this), + DATA_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int32_t +metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + loc_t backend_loc = { + 0, + }; + + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + stub = fop_open_stub(frame, metadisp_open_resume, &backend_loc, flags, fd, + xdata); + STACK_WIND_COOKIE(frame, metadisp_open_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(open, frame, -1, EINVAL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-readdir.c b/xlators/features/metadisp/src/metadisp-readdir.c new file mode 100644 index 
00000000000..5f840b1e88f --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-readdir.c @@ -0,0 +1,65 @@ +#include "metadisp.h" + +/** + * With a change to the posix xlator, readdir and readdirp are shockingly + * simple. + * + * The issue with separating the backend data of the files + * with the metadata is that readdirs must now read from multiple sources + * to coalesce the directory entries. + * + * The way we do this is to tell the METADATA_CHILD that when it's + * running readdirp, each file entry should have a stat wound to + * 'stat-source-of-truth'. + * + * see metadisp_stat for how it handles winds _from_posix. + */ + +int32_t +metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + METADISP_TRACE("."); + /* + * Always use readdirp, even if the original was readdir. Why? Because NFS. + * There are multiple translations between Gluster, UNIX, and NFS stat + * structures in that path. One of them uses the type etc. from the stat + * structure, which is only filled in by readdirp. If we use readdir, the + * entries do actually go all the way back to the client and are visible in + * getdents, but then the readdir throws them away because of the + * uninitialized type. + */ + GF_UNUSED int32_t ret; + if (!xdata) { + xdata = dict_new(); + } + + // ret = dict_set_int32 (xdata, "list-xattr", 1); + + // I'm my own source of truth! + ret = dict_set_static_ptr(xdata, "stat-source-of-truth", (void *)this); + + STACK_WIND(frame, default_readdirp_cbk, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + return 0; +} + +int32_t +metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + METADISP_TRACE("."); + if (!xdata) { + xdata = dict_new(); + } + GF_UNUSED int32_t ret; + // ret = dict_set_int32 (xdata, "list-xattr", 1); + + // I'm my own source of truth! 
+ ret = dict_set_static_ptr(xdata, "stat-source-of-truth", (void *)this); + + STACK_WIND(frame, default_readdirp_cbk, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-setattr.c b/xlators/features/metadisp/src/metadisp-setattr.c new file mode 100644 index 00000000000..6991cf644f3 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-setattr.c @@ -0,0 +1,90 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +int32_t +metadisp_backend_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *statpre, struct iatt *statpost, + dict_t *xdata) + +{ + METADISP_TRACE("backend_setattr_cbk"); + if (op_errno == ENOENT) { + op_errno = ENODATA; + op_ret = -1; + } + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + return 0; +} + +int32_t +metadisp_backend_setattr_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, + dict_t *xdata) + +{ + METADISP_TRACE("backend_setattr_resume"); + loc_t backend_loc = { + 0, + }; + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + STACK_WIND(frame, metadisp_backend_setattr_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->setattr, &backend_loc, stbuf, valid, + xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(setattr, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + METADISP_TRACE("%d %d", op_ret, op_errno); + call_stub_t *stub = NULL; + stub = cookie; + + if (op_ret != 0) { + goto unwind; + } + + if (!IA_ISREG(statpost->ia_type)) { + goto unwind; + } else if (!stub) { + op_errno = EINVAL; + goto unwind; + } + + METADISP_TRACE("resuming stub"); + call_resume(stub); + return 0; 
+unwind: + METADISP_TRACE("unwinding %d %d", op_ret, op_errno); + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + if (stub) { + call_stub_destroy(stub); + } + return 0; +} + +int32_t +metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + METADISP_TRACE("setattr"); + call_stub_t *stub = NULL; + stub = fop_setattr_stub(frame, metadisp_backend_setattr_resume, loc, stbuf, + valid, xdata); + STACK_WIND_COOKIE(frame, metadisp_setattr_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->setattr, loc, stbuf, valid, + xdata); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-stat.c b/xlators/features/metadisp/src/metadisp-stat.c new file mode 100644 index 00000000000..b06d0dbcddd --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-stat.c @@ -0,0 +1,124 @@ +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * The stat flow in METADISP is complicated because we must + * do ensure a few things: + * 1. stat, on the path within the metadata layer, + * MUST get the backend FD of the data layer. + * --- we wind to the metadata layer, then the data layer. + * + * 2. the metadata layer MUST be able to ask the data + * layer for stat information. + * --- this is 'syncop-internal-from-posix' + * + * 3. when the metadata exists BUT the data is missing, + * we MUST mark the backend file as bad and heal it. 
+ */ + +int32_t +metadisp_stat_backend_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + METADISP_TRACE("got backend stat results %d %d", op_ret, op_errno); + if (op_errno == ENOENT) { + STACK_UNWIND_STRICT(open, frame, -1, ENODATA, NULL, NULL); + return 0; + } + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int32_t +metadisp_stat_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + METADISP_TRACE("winding stat to path %s", loc->path); + if (gf_uuid_is_null(loc->gfid)) { + METADISP_TRACE("bad object, sending EUCLEAN"); + STACK_UNWIND_STRICT(open, frame, -1, EUCLEAN, NULL, NULL); + return 0; + } + + STACK_WIND(frame, metadisp_stat_backend_cbk, SECOND_CHILD(this), + SECOND_CHILD(this)->fops->stat, loc, xdata); + return 0; +} + +int32_t +metadisp_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + METADISP_TRACE("got stat results %d %d", op_ret, op_errno); + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + // only use the stub for the files + if (!IA_ISREG(buf->ia_type)) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int32_t +metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int32_t ret = 0; + loc_t backend_loc = { + 0, + }; + METADISP_FILTER_ROOT(stat, loc, xdata); + + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + if (dict_get_int32(xdata, "syncop-internal-from-posix", &ret) == 0) { + // if we've just been sent a stat from posix, then we know + // that we must send 
down a stat for a file to the second child. + // + // that means we can skip the stat for the first child and just + // send to the data disk. + METADISP_TRACE("got syncop-internal-from-posix"); + STACK_WIND(frame, default_stat_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->stat, &backend_loc, xdata); + return 0; + } + + // we do not know if the request is for a file, folder, etc. wind + // to first child to find out. + stub = fop_stat_stub(frame, metadisp_stat_resume, &backend_loc, xdata); + METADISP_TRACE("winding stat to first child %s", loc->path); + STACK_WIND_COOKIE(frame, metadisp_stat_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->stat, loc, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(stat, frame, -1, EINVAL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp-unlink.c b/xlators/features/metadisp/src/metadisp-unlink.c new file mode 100644 index 00000000000..1f6a8eb35ce --- /dev/null +++ b/xlators/features/metadisp/src/metadisp-unlink.c @@ -0,0 +1,160 @@ + +#include "metadisp.h" +#include <glusterfs/call-stub.h> + +/** + * The unlink flow in metadisp is complicated because we must + * do ensure that UNLINK causes both the metadata objects + * to get removed and the data objects to get removed. + */ + +int32_t +metadisp_unlink_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflag, dict_t *xdata) +{ + METADISP_TRACE("winding backend unlink to path %s", loc->path); + STACK_WIND(frame, default_unlink_cbk, DATA_CHILD(this), + DATA_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +} + +int32_t +metadisp_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + METADISP_TRACE(". 
%d %d", op_ret, op_errno); + + int ret = 0; + call_stub_t *stub = NULL; + int nlink = 0; + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, &nlink); + if (ret != 0) { + op_errno = EINVAL; + op_ret = -1; + goto unwind; + } + METADISP_TRACE("frontend hardlink count %d %d", ret, nlink); + if (nlink > 1) { + goto unwind; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int32_t +metadisp_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + call_stub_t *stub = NULL; + + if (cookie) { + stub = cookie; + } + + if (op_ret != 0) { + goto unwind; + } + + // fail fast on empty gfid so we don't loop forever + if (gf_uuid_is_null(buf->ia_gfid)) { + op_ret = -1; + op_errno = ENODATA; + goto unwind; + } + + // fill gfid since the stub is incomplete + memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t)); + memcpy(stub->args.loc.pargfid, postparent->ia_gfid, sizeof(uuid_t)); + + if (stub->poison) { + call_stub_destroy(stub); + stub = NULL; + return 0; + } + + call_resume(stub); + return 0; + +unwind: + if (stub) { + call_stub_destroy(stub); + } + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + loc_t backend_loc = { + 0, + }; + + if (gf_uuid_is_null(loc->gfid)) { + METADISP_TRACE("winding lookup for unlink to path %s", loc->path); + + // loop back to ourselves after a lookup + stub = fop_unlink_stub(frame, metadisp_unlink, loc, xflag, xdata); + 
STACK_WIND_COOKIE(frame, metadisp_unlink_lookup_cbk, stub, + METADATA_CHILD(this), + METADATA_CHILD(this)->fops->lookup, loc, xdata); + return 0; + } + + /* NOTE(review): backend_loc is never wiped in this function; confirm whether build_backend_loc() allocates anything that needs loc_wipe(). */ + if (build_backend_loc(loc->gfid, loc, &backend_loc)) { + goto unwind; + } + + // + // ensure we get the link count on the unlink response, so we can + // account for hardlinks before winding to the backend. + // NOTE: + // multiple xlators use GF_REQUEST_LINK_COUNT_XDATA. confirmation + // is needed to ensure that multiple requests will work in the same + // xlator stack. + // + /* Hold our own reference for the duration of the wind: a dict created here would otherwise never be released, and a caller-supplied one stays valid while we use it. */ + xdata = xdata ? dict_ref(xdata) : dict_new(); + if (!xdata) { + goto unwind; + } + /* Without the request key the callback cannot read the link count, so fail before anything is unlinked. */ + if (dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1) != 0) { + dict_unref(xdata); + goto unwind; + } + + METADISP_TRACE("winding frontend unlink to path %s", loc->path); + stub = fop_unlink_stub(frame, metadisp_unlink_resume, &backend_loc, xflag, + xdata); + + STACK_WIND_COOKIE(frame, metadisp_unlink_cbk, stub, METADATA_CHILD(this), + METADATA_CHILD(this)->fops->unlink, loc, xflag, xdata); + /* Drop the reference taken above. */ + dict_unref(xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(unlink, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} diff --git a/xlators/features/metadisp/src/metadisp.c b/xlators/features/metadisp/src/metadisp.c new file mode 100644 index 00000000000..3c8f150cebc --- /dev/null +++ b/xlators/features/metadisp/src/metadisp.c @@ -0,0 +1,46 @@ +#include <glusterfs/call-stub.h> + +#include "metadisp.h" +#include "metadisp-fops.h" +/* Translator init: refuses to start unless both the metadata child (first) and the data child (second) are configured, and warns when the translator has no parent. Returns 0 on success, -1 on misconfiguration. */ +int32_t +init(xlator_t *this) +{ + /* DATA_CHILD() is the second child, so a single-child volfile would crash on the first data fop. */ + if (!this->children || !this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "not configured with children. exiting"); + return -1; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. 
check volfile "); + } + + return 0; +} +/* Translator teardown; nothing is released here. */ +void +fini(xlator_t *this) +{ + return; +} + +/* defined in fops.c */ +struct xlator_fops fops; +/* No callbacks are registered. */ +struct xlator_cbks cbks = {}; +/* The translator takes no options; the table holds only the NULL-key terminator. */ +struct volume_options options[] = { + {.key = {NULL}}, +}; +/* Registration record for the translator: wires up init/fini, the fop and callback tables and the option table, under the identifier "metadisp" in the GF_EXPERIMENTAL category. */ +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .fops = &fops, + .cbks = &cbks, + .options = options, + .op_version = {1}, + .identifier = "metadisp", + .category = GF_EXPERIMENTAL, +}; diff --git a/xlators/features/metadisp/src/metadisp.h b/xlators/features/metadisp/src/metadisp.h new file mode 100644 index 00000000000..c8fd7a13c04 --- /dev/null +++ b/xlators/features/metadisp/src/metadisp.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef GF_METADISP_H_ +#define GF_METADISP_H_ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +/* The first child is used as the metadata child, the second as the data child. */ +#define METADATA_CHILD(_this) FIRST_CHILD(_this) +#define DATA_CHILD(_this) SECOND_CHILD(_this) +/* Builds dst_loc from gfid and src_loc; callers treat a non-zero return as failure. Defined outside this header. */ +int32_t +build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc); +/* Trace helper: logs under the name "metadisp" at GF_LOG_INFO. */ +#define METADISP_TRACE(_args...) gf_log("metadisp", GF_LOG_INFO, _args) +/* If loc->path is "/", winds _op with _args to METADATA_CHILD using the default callback and returns 0 from the enclosing function. Expects frame, this and loc to be in scope at the point of use. */ +#define METADISP_FILTER_ROOT(_op, _args...) \ + if (strcmp(loc->path, "/") == 0) { \ + STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this), \ + METADATA_CHILD(this)->fops->_op, _args); \ + return 0; \ + } + +#define METADISP_FILTER_ROOT_BY_GFID(_op, _gfid, _args...) 
\ + if (__is_root_gfid(_gfid)) { \ + STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this), \ + METADATA_CHILD(this)->fops->_op, _args); \ + return 0; \ + } +/* Reads the "gfid-req" pointer from _dict into _dest via dict_get_ptr(); VALIDATE_OR_GOTO presumably jumps to _lbl when the lookup does not return 0 (its definition is not in this header - confirm). */ +#define RESOLVE_GFID_REQ(_dict, _dest, _lbl) \ + VALIDATE_OR_GOTO(dict_get_ptr(_dict, "gfid-req", (void **)&_dest) == 0, \ + _lbl) + +#endif /* GF_METADISP_H_ */ diff --git a/xlators/features/namespace/Makefile.am b/xlators/features/namespace/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/namespace/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/namespace/src/Makefile.am b/xlators/features/namespace/src/Makefile.am new file mode 100644 index 00000000000..e355d42cf4e --- /dev/null +++ b/xlators/features/namespace/src/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = namespace.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +namespace_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +namespace_la_SOURCES = namespace.c +namespace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = namespace.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/xlators/lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/namespace/src/namespace.c b/xlators/features/namespace/src/namespace.c new file mode 100644 index 00000000000..86c5ebee900 --- /dev/null +++ b/xlators/features/namespace/src/namespace.c @@ -0,0 +1,1344 @@ +/* + * Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. 
+ * + * xlators/features/namespace: + * This translator tags each request with a namespace hash, + * which then can be used in later translators to track and + * throttle fops per namespace. + */ + +#include <sys/types.h> + +#include <glusterfs/defaults.h> +#include <glusterfs/hashfn.h> +#include <glusterfs/logging.h> +#include "namespace.h" + +/* Return codes for common path parsing functions. */ +enum _path_parse_result { + PATH_PARSE_RESULT_NO_PATH = 0, + PATH_PARSE_RESULT_FOUND = 1, + PATH_PARSE_RESULT_IS_GFID = 2, +}; + +typedef enum _path_parse_result path_parse_result_t; + +/* Clean up an ns_local struct: wipe its loc (which releases the inode + * reference taken in ns_local_new) and free the struct. NULL is a no-op. + */ +static inline void +ns_local_cleanup(ns_local_t *local) +{ + if (!local) { + return; + } + + loc_wipe(&local->loc); + GF_FREE(local); +} + +/* Create a new ns_local. We ref the inode, fake a new loc struct holding the + * inode and its gfid, and stash the stub given to us. Returns NULL when stub + * or inode is NULL, or when the allocation or inode_ref() fails. */ +static inline ns_local_t * +ns_local_new(call_stub_t *stub, inode_t *inode) +{ + ns_local_t *local = NULL; + loc_t loc = { + 0, + }; + + if (!stub || !inode) { + goto out; + } + + local = GF_CALLOC(1, sizeof(ns_local_t), 0); + if (local == NULL) { + goto out; + } + + /* Set up a fake loc_t struct to give to the getxattr call. */ + gf_uuid_copy(loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + + /* If for some reason inode_ref() fails, then just give up. */ + if (!loc.inode) { + GF_FREE(local); + /* Reset the pointer so the caller sees NULL, not freed memory. */ + local = NULL; + goto out; + } + + local->stub = stub; + local->loc = loc; + +out: + return local; +} + +/* Try parsing a path string. If the path is NULL or empty, return + * PATH_PARSE_RESULT_NO_PATH. If the path string is a GFID (it starts with + * '<'), return PATH_PARSE_RESULT_IS_GFID. Otherwise hash the top-level + * directory name, or '/' when there is none, store the hash in the info + * struct and return PATH_PARSE_RESULT_FOUND. 
*/ /* Returns NO_PATH for a NULL or empty path, IS_GFID when the path starts with '<', and otherwise FOUND after storing the hash of the first path component (or of '/' when there is none) in info->hash and setting info->found. */ +static path_parse_result_t +parse_path(ns_info_t *info, const char *path) +{ + int len = 0; + const char *ns_begin = path; + const char *ns_end = NULL; + + if (!path || strlen(path) == 0) { + return PATH_PARSE_RESULT_NO_PATH; + } + /* A leading '<' is what this function treats as a gfid-style path. */ + if (path[0] == '<') { + return PATH_PARSE_RESULT_IS_GFID; + } + + /* Right now we only want the top-level directory, so + * skip the initial '/' and read until the next '/'. */ + while (*ns_begin == '/') { + ns_begin++; + } + + /* ns_end will point to the next '/' or NULL if there is no delimiting + * '/' (i.e. "/directory" or the top level "/") */ + ns_end = strchr(ns_begin, '/'); + len = ns_end ? (ns_end - ns_begin) : strlen(ns_begin); + + if (len != 0) { + info->hash = SuperFastHash(ns_begin, len); + } else { + /* If our substring is empty, then we can hash '/' instead. + * '/' is used in the namespace config for the top-level + * namespace. */ + info->hash = SuperFastHash("/", 1); + } + + info->found = _gf_true; + return PATH_PARSE_RESULT_FOUND; +} + +/* Cache namespace info stored in the stack (info) into the inode. A heap copy of *info is stored in the inode context of this xlator. Returns 0 on success, -1 when inode or this is NULL, -(ENOMEM) when the copy cannot be allocated, or the non-zero result of inode_ctx_put(); the copy is freed on every failure. */ +static int +ns_inode_ctx_put(inode_t *inode, xlator_t *this, ns_info_t *info) +{ + ns_info_t *cached_ns_info = NULL; + uint64_t ns_as_64 = 0; + int ret = -1; + + if (!inode || !this) { + gf_log(this ? this->name : "namespace", GF_LOG_WARNING, + "Need a valid inode and xlator to cache ns_info."); + ret = -1; + goto out; + } + + cached_ns_info = GF_CALLOC(1, sizeof(ns_info_t), 0); + + /* If we've run out of memory, then return ENOMEM. 
*/ + if (cached_ns_info == NULL) { + gf_log(this->name, GF_LOG_WARNING, "No memory to cache ns_info."); + ret = -(ENOMEM); + goto out; + } + /* The inode context slot is a uint64_t, so the heap copy is stored as an integer-cast pointer. */ + *cached_ns_info = *info; + ns_as_64 = (uint64_t)(uintptr_t)cached_ns_info; + + ret = inode_ctx_put(inode, this, ns_as_64); + + if (ret) { + goto out; + } + + ret = 0; +out: + /* On any failure the copy was not handed to the inode, so release it here. */ + if (ret && cached_ns_info) { + GF_FREE(cached_ns_info); + } + + return ret; +} + +/* Retrieve namespace info cached in the inode into the stack for use in later + * translators. Returns -ENOENT when inode is NULL, otherwise the result of + * inode_ctx_get(); *info is overwritten only when that result is 0. */ +static int +ns_inode_ctx_get(inode_t *inode, xlator_t *this, ns_info_t *info) +{ + ns_info_t *cached_ns_info = NULL; + uint64_t ns_as_64 = 0; + int ret = -1; + + if (!inode) { + ret = -ENOENT; + goto out; + } + + ret = inode_ctx_get(inode, this, &ns_as_64); + + if (!ret) { + cached_ns_info = (ns_info_t *)(uintptr_t)ns_as_64; + *info = *cached_ns_info; + } + +out: + return ret; +} + +/* This callback is the top of the unwind path of our attempt to get the path + * manually from the posix translator. We'll try to parse the path returned + * if it exists, then cache the hash if possible. Then just return to the + * default stub that we provide in the local, since there's nothing else to do + * once we've gotten the namespace hash. The getxattr frame and its local are + * always released here, and the stub is resumed whenever it could be + * recovered from the local, even if no path was obtained. Always returns 0. */ +int32_t +get_path_resume_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH; + call_frame_t *resume_frame = NULL; + ns_local_t *local = NULL; + call_stub_t *stub = NULL; + ns_info_t *info = NULL; + char *path = NULL; + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + local = frame->local; + + GF_VALIDATE_OR_GOTO(this->name, local, out); + stub = local->stub; + + GF_VALIDATE_OR_GOTO(this->name, stub, out); + /* Get the ns_info from the frame that we will eventually resume, + * not the frame that we're going to destroy (frame). 
*/ + resume_frame = stub->frame; + + GF_VALIDATE_OR_GOTO(this->name, resume_frame, out); + GF_VALIDATE_OR_GOTO(this->name, resume_frame->root, out); + info = &resume_frame->root->ns_info; + /* A NULL dict (for example a failed getxattr) skips parsing; the stub is still resumed at out. */ + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + /* If we get a value back for the GET_ANCESTRY_PATH_KEY, then we + * try to access it and parse it like a path. */ + if (!op_ret && !dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path)) { + gf_log(this->name, GF_LOG_DEBUG, "G>P %s retrieved path %s", + uuid_utoa(local->loc.gfid), path); + /* Now let's parse a path, finally. */ + ret = parse_path(info, path); + } + + if (ret == PATH_PARSE_RESULT_FOUND) { + /* If we finally found namespace, then stash it. */ + ns_inode_ctx_put(local->loc.inode, this, info); + + gf_log(this->name, GF_LOG_DEBUG, "G>P %s %10u namespace found %s", + uuid_utoa(local->loc.inode->gfid), info->hash, path); + } else if (ret == PATH_PARSE_RESULT_NO_PATH) { + gf_log(this->name, GF_LOG_WARNING, "G>P %s has no path", + uuid_utoa(local->loc.inode->gfid)); + } else if (ret == PATH_PARSE_RESULT_IS_GFID) { + gf_log(this->name, GF_LOG_WARNING, + "G>P %s winding failed, still have gfid", + uuid_utoa(local->loc.inode->gfid)); + } + +out: + /* Make sure to clean up local finally. */ + /* frame->local is cleared before STACK_DESTROY because local is released separately just below. */ + if (frame) { + frame->local = NULL; + STACK_DESTROY(frame->root); + } + + if (local) { + ns_local_cleanup(local); + } + /* Resume the original fop whether or not a namespace was found. */ + if (stub) { + call_resume(stub); + } + + return 0; +} + +/* This function tries first to set a namespace based on the information that + * it can retrieve from an `loc_t`. This includes first looking for a cached + * namespace in the inode, then trying to parse the path string in the `loc_t` + * struct. If this fails, then it will try to call inode_path. 
*/ /* Resets the ns_info of the frame's stack root, then tries the inode cache, loc->path and finally inode_path(). Returns FOUND when a namespace hash was set, NO_PATH when tagging is disabled or nothing usable was supplied, and IS_GFID when only a gfid is known and the caller must resolve the path by winding a getxattr. fn is used only as a log prefix. */ +static path_parse_result_t +set_ns_from_loc(const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH; + ns_private_t *priv = (ns_private_t *)this->private; + ns_info_t *info = &frame->root->ns_info; + char *path = NULL; + + info->hash = 0; + info->found = _gf_false; + + if (!priv->tag_namespaces) { + return ret; + } + + /* This is our first pass at trying to get a path. Try getting + * from the inode context, then from the loc's path itself. */ + if (!loc || !loc->path || !loc->inode) { + ret = PATH_PARSE_RESULT_NO_PATH; + } else if (!ns_inode_ctx_get(loc->inode, this, info)) { + ret = PATH_PARSE_RESULT_FOUND; + } else { + ret = parse_path(info, loc->path); + gf_log(this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", fn, + loc->path); + + if (ret == PATH_PARSE_RESULT_FOUND) { + ns_inode_ctx_put(loc->inode, this, info); + } + } + + /* Keep trying by calling inode_path next, making sure to copy + the loc's gfid into its inode if necessary. */ + if (ret == PATH_PARSE_RESULT_IS_GFID) { + if (gf_uuid_is_null(loc->inode->gfid)) { + gf_uuid_copy(loc->inode->gfid, loc->gfid); + } + + if (inode_path(loc->inode, NULL, &path) >= 0 && path) { + /* Parse the path that inode_path() just resolved; loc->path is still the gfid form that brought us here, so re-parsing it could never succeed. */ + ret = parse_path(info, path); + gf_log(this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", fn, + path); + + if (ret == PATH_PARSE_RESULT_FOUND) { + ns_inode_ctx_put(loc->inode, this, info); + } + } + + if (path) { + GF_FREE(path); + } + } + + /* Report our status, and if we have a GFID, we'll eventually try a + * GET_ANCESTRY_PATH_KEY wind when we return from this function. 
*/ + if (ret == PATH_PARSE_RESULT_FOUND) { + gf_log(this->name, GF_LOG_DEBUG, + "%s: LOC %s %10u namespace found for %s", fn, + uuid_utoa(loc->inode->gfid), info->hash, loc->path); + } else if (ret == PATH_PARSE_RESULT_NO_PATH) { + gf_log(this->name, GF_LOG_WARNING, "%s: LOC has no path", fn); + } else if (ret == PATH_PARSE_RESULT_IS_GFID) { + /* Make sure to copy the inode's gfid for the eventual wind. */ + if (gf_uuid_is_null(loc->inode->gfid)) { + gf_uuid_copy(loc->inode->gfid, loc->gfid); + } + + gf_log(this->name, GF_LOG_DEBUG, "%s: LOC %s winding, looking for path", + fn, uuid_utoa(loc->inode->gfid)); + } + + return ret; +} + +/* This function tries first to set a namespace based on the information that + * it can retrieve from an `fd_t`. This includes first looking for a cached + * namespace in the inode, then trying to call inode_path manually. It resets + * the ns_info of the frame's stack root first and returns NO_PATH when + * tagging is disabled or no fd/inode is supplied; fn is used only as a log + * prefix. */ +static path_parse_result_t +set_ns_from_fd(const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH; + ns_private_t *priv = (ns_private_t *)this->private; + ns_info_t *info = &frame->root->ns_info; + char *path = NULL; + + info->hash = 0; + info->found = _gf_false; + + if (!priv->tag_namespaces) { + return ret; + } + + /* This is our first pass at trying to get a path. Try getting + * from the inode context, then inode_path. */ + if (!fd || !fd->inode) { + ret = PATH_PARSE_RESULT_NO_PATH; + } else if (!ns_inode_ctx_get(fd->inode, this, info)) { + ret = PATH_PARSE_RESULT_FOUND; + } else if (inode_path(fd->inode, NULL, &path) >= 0 && path) { + ret = parse_path(info, path); + gf_log(this->name, GF_LOG_DEBUG, "%s: FD retrieved path %s", fn, path); + + if (ret == PATH_PARSE_RESULT_FOUND) { + ns_inode_ctx_put(fd->inode, this, info); + } + } + /* path was allocated by inode_path() and is no longer needed once parsed. */ + if (path) { + GF_FREE(path); + } + + /* Report our status, and if we have a GFID, we'll eventually try a + * GET_ANCESTRY_PATH_KEY wind when we return from this function. 
*/ + if (ret == PATH_PARSE_RESULT_FOUND) { + gf_log(this->name, GF_LOG_DEBUG, "%s: FD %s %10u namespace found", fn, + uuid_utoa(fd->inode->gfid), info->hash); + } else if (ret == PATH_PARSE_RESULT_NO_PATH) { + gf_log(this->name, GF_LOG_WARNING, "%s: FD has no path", fn); + } else if (ret == PATH_PARSE_RESULT_IS_GFID) { + gf_log(this->name, GF_LOG_DEBUG, "%s: FD %s winding, looking for path", + fn, uuid_utoa(fd->inode->gfid)); + } + + return ret; +} + +/* This macro does the work of winding down a call of `getxattr` in the case + * that we have to retrieve the path manually. It assumes that there is a label + * called `wind` and the existence of several basic variables (frame, this), + * but otherwise is general enough for any fop (fd- or loc-based.) + * + * fop is the fop name used to pick fop_<fop>_stub and default_<fop>; inode + * is the inode whose ancestry path is requested; args are the fop arguments + * stored in the stub. When the frame, stub or local cannot be allocated, + * whatever was already allocated is released and control jumps to `wind`, + * so the fop proceeds without a namespace. */ +#define GET_ANCESTRY_PATH_WIND(fop, inode, args...) \ + do { \ + ns_info_t *info = &frame->root->ns_info; \ + call_frame_t *new_frame = NULL; \ + ns_local_t *local = NULL; \ + call_stub_t *stub = NULL; \ + \ + gf_log(this->name, GF_LOG_DEBUG, " %s winding, looking for path", \ + uuid_utoa(inode->gfid)); \ + \ + new_frame = create_frame(this, this->ctx->pool); \ + if (!new_frame) { \ + gf_log(this->name, GF_LOG_ERROR, \ + "Cannot allocate new call frame."); \ + goto wind; \ + } \ + \ + stub = fop_##fop##_stub(frame, default_##fop, args); \ + if (!stub) { \ + gf_log(this->name, GF_LOG_ERROR, \ + "Cannot allocate function stub."); \ + /* The frame created above is not used on this path. */ \ + STACK_DESTROY(new_frame->root); \ + goto wind; \ + } \ + \ + new_frame->root->uid = 0; \ + new_frame->root->gid = 0; \ + /* Put a phony "not found" NS info into this call. */ \ + new_frame->root->ns_info = *info; \ + \ + local = ns_local_new(stub, inode); \ + if (!local) { \ + gf_log(this->name, GF_LOG_ERROR, \ + "Cannot allocate function local."); \ + /* Nothing will resume the stub or use the frame. */ \ + call_stub_destroy(stub); \ + STACK_DESTROY(new_frame->root); \ + goto wind; \ + } \ + \ + new_frame->local = local; \ + /* After allocating a new frame, a call stub (to \ + * resume our current fop), and a local variables \ + * struct (for our loc to getxattr and our resume \ + * stub), call getxattr and unwind to get_path_resume_cbk. 
\ + */ \ + STACK_WIND(new_frame, get_path_resume_cbk, FIRST_CHILD(this), \ + FIRST_CHILD(this)->fops->getxattr, &local->loc, \ + GET_ANCESTRY_PATH_KEY, NULL); \ + } while (0) +/* ns_rmdir: tags the frame with a namespace taken from loc. When only a gfid is known, GET_ANCESTRY_PATH_WIND fetches the path first and the fop is resumed from a stub; in every other case, and when the macro falls back to the wind label, rmdir is wound directly to FIRST_CHILD. Always returns 0. */ +int32_t +ns_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(rmdir, loc->inode, loc, xflags, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata); + return 0; +} +/* ns_unlink: same flow as ns_rmdir, tagging from loc and winding unlink to FIRST_CHILD. */ +int32_t +ns_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(unlink, loc->inode, loc, xflags, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata); + return 0; +} +/* ns_rename: the namespace is taken from newloc (the destination), not oldloc; otherwise the flow matches the other loc-based fops. */ +int32_t +ns_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, + newloc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(rename, newloc->inode, oldloc, newloc, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; +} +/* ns_link: like ns_rename, the namespace is taken from newloc. */ +int32_t +ns_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, + newloc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(link, newloc->inode, oldloc, newloc, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} +/* ns_mkdir: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds mkdir to FIRST_CHILD. */ +int32_t 
+ns_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + /* Tag the frame from loc; IS_GFID means the path must be fetched before the fop can be tagged. */ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(mkdir, loc->inode, loc, mode, umask, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; +} +/* ns_symlink: tags from loc (the link being created); gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds symlink to FIRST_CHILD. */ +int32_t +ns_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(symlink, loc->inode, linkname, loc, umask, + xdata); + return 0; + } +wind: + STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; +} +/* ns_mknod: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds mknod to FIRST_CHILD. */ +int32_t +ns_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(mknod, loc->inode, loc, mode, dev, umask, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata); + return 0; +} +/* ns_create: tags from loc (not from the new fd); gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds create to FIRST_CHILD. */ +int32_t +ns_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(create, loc->inode, loc, flags, mode, umask, fd, + xdata); + return 0; + } +wind: + STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} +/* ns_fsetattr: tags from fd; when only a gfid is known the path is fetched through GET_ANCESTRY_PATH_WIND, otherwise fsetattr is wound to FIRST_CHILD. */ +int32_t +ns_fsetattr(call_frame_t *frame, xlator_t 
*this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + /* Tag the frame from the fd's inode; IS_GFID means the path must be fetched before the fop can be tagged. */ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fsetattr, fd->inode, fd, stbuf, valid, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; +} +/* ns_setattr: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds setattr to FIRST_CHILD. */ +int32_t +ns_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(setattr, loc->inode, loc, stbuf, valid, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; +} +/* ns_fremovexattr: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fremovexattr to FIRST_CHILD. */ +int32_t +ns_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fremovexattr, fd->inode, fd, name, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +} +/* ns_removexattr: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds removexattr to FIRST_CHILD. */ +int32_t +ns_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(removexattr, loc->inode, loc, name, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +} +/* ns_setxattr: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds setxattr to FIRST_CHILD. */ +int32_t +ns_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + path_parse_result_t ret = 
set_ns_from_loc(__FUNCTION__, frame, this, loc); + /* IS_GFID means only a gfid is known: fetch the path first and let the stub resume this fop. */ + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(setxattr, loc->inode, loc, dict, flags, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; +} +/* ns_fsetxattr: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fsetxattr to FIRST_CHILD. */ +int32_t +ns_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fsetxattr, fd->inode, fd, dict, flags, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +} +/* ns_truncate: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds truncate to FIRST_CHILD. */ +int32_t +ns_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(truncate, loc->inode, loc, offset, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} +/* ns_ftruncate: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds ftruncate to FIRST_CHILD. */ +int32_t +ns_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(ftruncate, fd->inode, fd, offset, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} +/* ns_writev: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds writev to FIRST_CHILD. */ +int32_t +ns_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == 
PATH_PARSE_RESULT_IS_GFID) { + /* Only a gfid is known: fetch the path first and let the stub resume this writev. */ + GET_ANCESTRY_PATH_WIND(writev, fd->inode, fd, vector, count, offset, + flags, iobref, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} +/* ns_lookup: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds lookup to FIRST_CHILD. */ +int32_t +ns_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(lookup, loc->inode, loc, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} +/* ns_stat: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds stat to FIRST_CHILD. */ +int32_t +ns_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(stat, loc->inode, loc, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} +/* ns_fstat: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fstat to FIRST_CHILD. */ +int32_t +ns_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fstat, fd->inode, fd, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} +/* ns_readlink: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds readlink to FIRST_CHILD. */ +int32_t +ns_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(readlink, loc->inode, loc, size, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; +} +/* ns_access: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds access to FIRST_CHILD. */ +int32_t 
+ns_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + /* Tag the frame from loc; IS_GFID means the path must be fetched before the fop can be tagged. */ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(access, loc->inode, loc, mask, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; +} +/* ns_open: the namespace is derived from fd (its inode), not from loc, although both are passed; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds open to FIRST_CHILD. */ +int32_t +ns_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(open, fd->inode, loc, flags, fd, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} +/* ns_readv: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds readv to FIRST_CHILD. */ +int32_t +ns_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(readv, fd->inode, fd, size, offset, flags, + xdata); + return 0; + } +wind: + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} +/* ns_flush: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds flush to FIRST_CHILD. */ +int32_t +ns_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(flush, fd->inode, fd, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +} +/* ns_fsync: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fsync to FIRST_CHILD. */ +int32_t +ns_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { 
+ /* Only a gfid is known: fetch the path first and let the stub resume this fsync. */ GET_ANCESTRY_PATH_WIND(fsync, fd->inode, fd, datasync, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} +/* ns_opendir: tags from loc (not from the new fd); gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds opendir to FIRST_CHILD. */ +int32_t +ns_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(opendir, loc->inode, loc, fd, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} +/* ns_fsyncdir: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fsyncdir to FIRST_CHILD. */ +int32_t +ns_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) + +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fsyncdir, fd->inode, fd, datasync, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata); + return 0; +} +/* ns_rchecksum: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds rchecksum to FIRST_CHILD. */ +int32_t +ns_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(rchecksum, fd->inode, fd, offset, len, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_rchecksum_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); + return 0; +} +/* ns_statfs: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds statfs to FIRST_CHILD. */ +int32_t +ns_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(statfs, loc->inode, loc, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +} + 
+/* ns_inodelk: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds inodelk to FIRST_CHILD. */ int32_t +ns_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(inodelk, loc->inode, volume, loc, cmd, flock, + xdata); + return 0; + } +wind: + STACK_WIND(frame, default_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock, + xdata); + return 0; +} +/* ns_finodelk: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds finodelk to FIRST_CHILD. */ +int32_t +ns_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(finodelk, fd->inode, volume, fd, cmd, flock, + xdata); + return 0; + } +wind: + STACK_WIND(frame, default_finodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock, + xdata); + return 0; +} +/* ns_entrylk: tags from loc; gfid-only locs go through GET_ANCESTRY_PATH_WIND, everything else winds entrylk to FIRST_CHILD. */ +int32_t +ns_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(entrylk, loc->inode, volume, loc, basename, cmd, + type, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd, + type, xdata); + return 0; +} +/* ns_fentrylk: tags from fd; gfid-only inodes go through GET_ANCESTRY_PATH_WIND, everything else winds fentrylk to FIRST_CHILD. */ +int32_t +ns_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fentrylk, fd->inode, volume, fd, basename, cmd, + type, xdata); + return 0; + } +wind: + 
STACK_WIND(frame, default_fentrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd, + type, xdata); + return 0; +} + +int32_t +ns_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fgetxattr, fd->inode, fd, name, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +} + +int32_t +ns_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(getxattr, loc->inode, loc, name, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +} + +int32_t +ns_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(lk, fd->inode, fd, cmd, flock, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_lk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata); + return 0; +} + +int32_t +ns_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(readdir, fd->inode, fd, size, offset, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + + return 0; +} + +int32_t 
+ns_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(readdirp, fd->inode, fd, size, offset, dict); + return 0; + } +wind: + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + return 0; +} + +int32_t +ns_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(xattrop, loc->inode, loc, flags, dict, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata); + + return 0; +} + +int32_t +ns_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fxattrop, fd->inode, fd, flags, dict, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata); + + return 0; +} + +int32_t +ns_getspec(call_frame_t *frame, xlator_t *this, const char *key, int32_t flag) +{ + STACK_WIND(frame, default_getspec_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getspec, key, flag); + return 0; +} + +int32_t +ns_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(fallocate, fd->inode, fd, keep_size, offset, len, + xdata); + return 0; + 
} +wind: + STACK_WIND(frame, default_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len, + xdata); + return 0; +} + +int32_t +ns_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(discard, fd->inode, fd, offset, len, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +} + +int32_t +ns_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd); + + if (ret == PATH_PARSE_RESULT_IS_GFID) { + GET_ANCESTRY_PATH_WIND(zerofill, fd->inode, fd, offset, len, xdata); + return 0; + } +wind: + STACK_WIND(frame, default_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} + +int +ns_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ns_as_64 = 0; + ns_info_t *info = NULL; + + inode_ctx_del(inode, this, &ns_as_64); + + if (!ns_as_64) { + return 0; + } + + info = (ns_info_t *)(uintptr_t)ns_as_64; + GF_FREE(info); + + return 0; +} + +int32_t +init(xlator_t *this) +{ + int32_t ret = -1; + ns_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO(GF_NAMESPACE, this, out); + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "translator needs a single subvolume."); + goto out; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_ERROR, + "dangling volume. 
please check volfile."); + goto out; + } + + priv = GF_CALLOC(1, sizeof(ns_private_t), 0); + + if (!priv) { + gf_log(this->name, GF_LOG_ERROR, "Can't allocate ns_priv structure."); + goto out; + } + + GF_OPTION_INIT("tag-namespaces", priv->tag_namespaces, bool, out); + + gf_log(this->name, GF_LOG_INFO, "Namespace xlator loaded"); + this->private = priv; + ret = 0; + +out: + if (ret) { + GF_FREE(priv); + } + + return ret; +} + +void +fini(xlator_t *this) +{ + GF_FREE(this->private); +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = -1; + ns_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, options, out); + + priv = (ns_private_t *)this->private; + + GF_OPTION_RECONF("tag-namespaces", priv->tag_namespaces, options, bool, + out); + + ret = 0; +out: + return ret; +} + +struct xlator_fops fops = { + .lookup = ns_lookup, + .stat = ns_stat, + .fstat = ns_fstat, + .truncate = ns_truncate, + .ftruncate = ns_ftruncate, + .access = ns_access, + .readlink = ns_readlink, + .mknod = ns_mknod, + .mkdir = ns_mkdir, + .unlink = ns_unlink, + .rmdir = ns_rmdir, + .symlink = ns_symlink, + .rename = ns_rename, + .link = ns_link, + .create = ns_create, + .open = ns_open, + .readv = ns_readv, + .writev = ns_writev, + .flush = ns_flush, + .fsync = ns_fsync, + .opendir = ns_opendir, + .readdir = ns_readdir, + .readdirp = ns_readdirp, + .fsyncdir = ns_fsyncdir, + .statfs = ns_statfs, + .setxattr = ns_setxattr, + .getxattr = ns_getxattr, + .fsetxattr = ns_fsetxattr, + .fgetxattr = ns_fgetxattr, + .removexattr = ns_removexattr, + .fremovexattr = ns_fremovexattr, + .lk = ns_lk, + .inodelk = ns_inodelk, + .finodelk = ns_finodelk, + .entrylk = ns_entrylk, + .fentrylk = ns_fentrylk, + .rchecksum = ns_rchecksum, + .xattrop = ns_xattrop, + .fxattrop = ns_fxattrop, + .setattr = ns_setattr, + .fsetattr = ns_fsetattr, + .getspec = ns_getspec, + .fallocate = ns_fallocate, + .discard = ns_discard, + .zerofill = 
ns_zerofill, +}; + +struct xlator_cbks cbks = { + .forget = ns_forget, +}; + +struct xlator_dumpops dumpops; + +struct volume_options options[] = { + { + .key = {"tag-namespaces"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option enables this translator's functionality " + "that tags every fop with a namespace hash for later " + "throttling, stats collection, logging, etc.", + .op_version = {GD_OP_VERSION_4_1_0}, + .tags = {"namespace"}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .op_version = {GD_OP_VERSION_3_12_0}, + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "namespace", + .category = GF_TECH_PREVIEW, +}; diff --git a/xlators/features/namespace/src/namespace.h b/xlators/features/namespace/src/namespace.h new file mode 100644 index 00000000000..3a9b84d6426 --- /dev/null +++ b/xlators/features/namespace/src/namespace.h @@ -0,0 +1,23 @@ +#ifndef __NAMESPACE_H__ +#define __NAMESPACE_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> + +#define GF_NAMESPACE "namespace" + +typedef struct { + gf_boolean_t tag_namespaces; +} ns_private_t; + +typedef struct { + loc_t loc; /* We store a "fake" loc_t for the getxattr wind. */ + call_stub_t *stub; /* A stub back to the function we're resuming. 
*/ +} ns_local_t; + +#endif /* __NAMESPACE_H__ */ diff --git a/xlators/features/path-convertor/src/Makefile.am b/xlators/features/path-convertor/src/Makefile.am deleted file mode 100644 index 393a7bd089c..00000000000 --- a/xlators/features/path-convertor/src/Makefile.am +++ /dev/null @@ -1,15 +0,0 @@ - -xlator_LTLIBRARIES = path-converter.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features - -path_converter_la_LDFLAGS = -module -avoid-version - -path_converter_la_SOURCES = path.c -path_converter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/features/path-convertor/src/path.c b/xlators/features/path-convertor/src/path.c deleted file mode 100644 index 5c52e0a8d53..00000000000 --- a/xlators/features/path-convertor/src/path.c +++ /dev/null @@ -1,1228 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ -/* TODO: add gf_log to all the cases returning errors */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -/** - * xlators/features/path-translator: - * This translator converts the path it gets into user specified targets. 
- */ - -#include <sys/types.h> -#include <regex.h> -#include <time.h> -#include <errno.h> -#include "glusterfs.h" -#include "xlator.h" -#include "path-mem-types.h" - -typedef struct path_private -{ - int32_t this_len; - int32_t start_off; - int32_t end_off; - char *this; - char *that; - char *path; - regex_t *preg; -} path_private_t; - -static char * -name_this_to_that (xlator_t *xl, const char *path, const char *name) -{ - path_private_t *priv = xl->private; - char priv_path[PATH_MAX] = {0,}; - char *tmp_name = NULL; - int32_t path_len = strlen (path); - int32_t name_len = strlen (name) - ZR_FILE_CONTENT_STRLEN; - int32_t total_len = path_len + name_len; - int32_t i = 0, j = 0; - - if (path_len >= priv->end_off) - return (char *)name; - - if (priv->end_off && (total_len > priv->end_off)) { - j = priv->start_off; - tmp_name = GF_CALLOC (1, (total_len + - ZR_FILE_CONTENT_STRLEN), - gf_path_mt_char); - ERR_ABORT (tmp_name); - - /* Get the complete path for the file first */ - strcpy (tmp_name, path); - strcat (tmp_name, name + ZR_FILE_CONTENT_STRLEN); - - strncpy (priv_path, tmp_name, priv->start_off); - for (i = priv->start_off; i < priv->end_off; i++) { - if (tmp_name[i] == '/') - continue; - priv_path[j++] = tmp_name[i]; - } - memcpy ((priv_path + j), - (tmp_name + priv->end_off), - (total_len - priv->end_off)); - priv_path[(total_len - (priv->end_off - j))] = '\0'; - - strcpy (tmp_name, ZR_FILE_CONTENT_STR); - strcat (tmp_name, priv_path); - - return tmp_name; - } - - return (char *)name; -} - -/* This function should return - * NULL - - * converted path - if path match - * same path - if it doesn't match - */ -static char * -path_this_to_that (xlator_t *xl, const char *path) -{ - path_private_t *priv = xl->private; - char *priv_path = NULL; - int32_t path_len = strlen (path); - int32_t i = 0, j = 0; - - if (priv->end_off && (path_len > priv->start_off)) { - priv_path = GF_CALLOC (1, path_len, gf_path_mt_char); - ERR_ABORT (priv_path); - - if (priv->start_off && 
(path_len > priv->start_off)) - memcpy (priv_path, path, priv->start_off); - if (path_len > priv->end_off) { - j = priv->start_off; - for (i = priv->start_off; i < priv->end_off; i++) { - if (path[i] == '/') - continue; - priv_path[j++] = path[i]; - } - memcpy ((priv_path + j), - (path + priv->end_off), - (path_len - priv->end_off)); - priv_path[(path_len - (priv->end_off - j))] = '\0'; - } - return priv_path; - } - return (char *)path; -} - -int32_t -path_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); - return 0; -} - -int32_t -path_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - return 0; -} - -int32_t -path_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entries, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entries, count); - return 0; -} - -int32_t -path_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - - -int32_t -path_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *buf, - struct iatt *sbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf, sbuf); - return 0; -} - -int32_t -path_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *xattr, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr); - return 0; -} - - -int32_t -path_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - 
int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} - -int32_t -path_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} - - -int32_t -path_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} - -int32_t -path_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} - -int32_t -path_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - return 0; -} - - -int32_t -path_rename_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - - - -int32_t -path_common_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -path_common_dict_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 
0; -} - -int32_t -path_common_remove_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno,struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -path_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno,struct iatt *prebuf, - struct iatt *postbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - - -int32_t -path_common_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/* */ -int32_t -path_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, path_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xattr_req); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - 
STACK_WIND (frame, - path_readlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, - loc, - size); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t dev) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, - mode, - dev); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_mkdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - loc, - mode); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_remove_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, 
NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_remove_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, - loc); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - linkpath, - loc); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - char *oldloc_path = (char *)oldloc->path; - char *tmp_oldloc_path = NULL; - - char *newloc_path = (char *)newloc->path; - char *tmp_newloc_path = NULL; - - if (!(tmp_oldloc_path = path_this_to_that (this, oldloc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - oldloc->path = tmp_oldloc_path; - - if (!(tmp_newloc_path = path_this_to_that (this, newloc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - newloc->path = tmp_newloc_path; - - STACK_WIND (frame, - path_rename_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldloc, - newloc); - - oldloc->path = oldloc_path; - if (tmp_oldloc_path != oldloc_path) - GF_FREE (tmp_oldloc_path); - - newloc->path = newloc_path; - if (tmp_newloc_path != newloc_path) - GF_FREE (tmp_newloc_path); - - return 0; -} - -int32_t -path_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - char *oldloc_path = (char *)oldloc->path; - char *tmp_oldloc_path = NULL; - - char *newloc_path = (char *)newloc->path; - char *tmp_newloc_path = NULL; - - if (!(tmp_oldloc_path = 
path_this_to_that (this, oldloc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - oldloc->path = tmp_oldloc_path; - - if (!(tmp_newloc_path = path_this_to_that (this, newloc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - newloc->path = tmp_newloc_path; - - STACK_WIND (frame, - path_link_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, - oldloc, - newloc); - - oldloc->path = oldloc_path; - if (tmp_oldloc_path != oldloc_path) - GF_FREE (tmp_oldloc_path); - - newloc->path = newloc_path; - if (tmp_newloc_path != newloc_path) - GF_FREE (tmp_newloc_path); - - return 0; -} - -int32_t -path_setattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preop, - struct iatt *postop) -{ - STACK_UNWIND (frame, op_ret, op_errno, preop, postop); - return 0; -} - -int32_t -path_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_setattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, - loc, - stbuf, valid); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - - -int32_t -path_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - - -int32_t -path_open (call_frame_t 
*frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, - flags, - fd, - wbflags); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, - flags, - mode, - fd); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - char *tmp_name = NULL; - data_pair_t *trav = dict->members_list; - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - tmp_name = name_this_to_that (this, loc->path, trav->key); - if (tmp_name != trav->key) { - trav->key = tmp_name; - } else { - tmp_name = NULL; - } - } - - STACK_WIND (frame, - path_common_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - loc, - dict, - flags); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - GF_FREE (tmp_name); - - return 0; -} - -int32_t -path_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, 
- const char *name) -{ - char *tmp_name = (char *)name; - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - if (ZR_FILE_CONTENT_REQUEST(name)) { - tmp_name = name_this_to_that (this, loc->path, name); - } - - STACK_WIND (frame, - path_common_dict_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - loc, - tmp_name); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - if (tmp_name != name) - GF_FREE (tmp_name); - - return 0; -} - -int32_t -path_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - char *tmp_name = (char *)name; - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - if (ZR_FILE_CONTENT_REQUEST(name)) { - tmp_name = name_this_to_that (this, loc->path, name); - } - - STACK_WIND (frame, - path_common_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - loc, - tmp_name); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - if (tmp_name != name) - GF_FREE (tmp_name); - - return 0; -} - -int32_t -path_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, - fd); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - char *loc_path = (char *)loc->path; 
- char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->access, - loc, - mask); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *fchecksum, - uint8_t *dchecksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); - return 0; -} - -int32_t -path_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_checksum_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->checksum, - loc, - flag); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - - -int32_t -path_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, path_common_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->entrylk, - volume, loc, basename, cmd, type); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -path_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, 
loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - volume, loc, cmd, lock); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - - -int32_t -path_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - char *loc_path = (char *)loc->path; - char *tmp_path = NULL; - - if (!(tmp_path = path_this_to_that (this, loc->path))) { - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - return 0; - } - loc->path = tmp_path; - - STACK_WIND (frame, - path_common_dict_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - loc, - flags, - dict); - - loc->path = loc_path; - if (tmp_path != loc_path) - GF_FREE (tmp_path); - - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_path_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int32_t -init (xlator_t *this) -{ - dict_t *options = this->options; - path_private_t *priv = NULL; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "path translator requires exactly one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile "); - } - - priv = GF_CALLOC (1, sizeof (*priv), gf_path_mt_path_private_t); - ERR_ABORT (priv); - if (dict_get (options, "start-offset")) { - priv->start_off = data_to_int32 (dict_get (options, - "start-offset")); - } - if (dict_get (options, "end-offset")) { - priv->end_off = data_to_int32 (dict_get (options, - "end-offset")); - } - - if (dict_get (options, "regex")) { - int32_t ret = 0; - priv->preg = GF_CALLOC (1, sizeof (regex_t), - gf_path_mt_regex_t); - ERR_ABORT (priv->preg); - ret = regcomp (priv->preg, - data_to_str (dict_get (options, "regex")), - REG_EXTENDED); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to compile the 'option regex'"); - GF_FREE (priv); - return -1; - } - if (dict_get (options, "replace-with")) { - priv->that = data_to_str (dict_get (options, - "replace-with")); - } else { - priv->that = ""; - } - } - - this->private = priv; - return 0; -} - -void -fini (xlator_t *this) -{ - return; -} - -struct xlator_fops fops = { - .stat = path_stat, - .readlink = path_readlink, - .mknod = path_mknod, - .mkdir = path_mkdir, - .unlink = path_unlink, - .rmdir = path_rmdir, - .symlink = path_symlink, - .rename = path_rename, - .link = path_link, - .truncate = path_truncate, - .open = path_open, - .setxattr = path_setxattr, - .getxattr = path_getxattr, - .removexattr = path_removexattr, - .opendir = path_opendir, - .access = path_access, - .create = path_create, - .lookup = path_lookup, - .checksum = path_checksum, - .xattrop = path_xattrop, - .entrylk = path_entrylk, - .inodelk = path_inodelk, - .setattr = path_setattr, -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {"start-offset"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 4095 - }, - { .key = {"end-offset"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 4096 - }, - { .key = {"replace-with"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {NULL} }, -}; diff --git a/xlators/features/protect/src/Makefile.am 
b/xlators/features/protect/src/Makefile.am deleted file mode 100644 index 7eb93f32e11..00000000000 --- a/xlators/features/protect/src/Makefile.am +++ /dev/null @@ -1,21 +0,0 @@ -xlator_LTLIBRARIES = prot_dht.la prot_client.la prot_server.la - -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features - -prot_dht_la_LDFLAGS = -module -avoidversion -prot_dht_la_SOURCES = prot_dht.c -prot_dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -prot_client_la_LDFLAGS = -module -avoidversion -prot_client_la_SOURCES = prot_client.c -prot_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -prot_server_la_LDFLAGS = -module -avoidversion -prot_server_la_SOURCES = prot_server.c -prot_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/features/protect/src/prot_client.c b/xlators/features/protect/src/prot_client.c deleted file mode 100644 index d09715067bd..00000000000 --- a/xlators/features/protect/src/prot_client.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" - -#ifndef __NetBSD__ -#include <execinfo.h> -#endif - -#define NUM_FRAMES 20 - -static char PROTECT_KEY[] = "trusted.glusterfs.protect"; - -enum { - PROT_ACT_NONE = 0, - PROT_ACT_LOG, - PROT_ACT_REJECT, -}; - -void -pcli_print_trace (char *name, call_frame_t *frame) -{ - void *frames[NUM_FRAMES]; - char **symbols; - int size; - int i; - - gf_log (name, GF_LOG_INFO, "Translator stack:"); - while (frame) { - gf_log (name, GF_LOG_INFO, "%s (%s)", - frame->wind_from, frame->this->name); - frame = frame->next; - } - - size = backtrace(frames,NUM_FRAMES); - if (size <= 0) { - return; - } - symbols = backtrace_symbols(frames,size); - if (!symbols) { - return; - } - - gf_log(name, GF_LOG_INFO, "Processor stack:"); - for (i = 0; i < size; ++i) { - gf_log (name, GF_LOG_INFO, "%s", symbols[i]); - } - free(symbols); -} - -int32_t -pcli_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) -{ - uint64_t value; - - if (newloc->parent == oldloc->parent) { - gf_log (this->name, GF_LOG_DEBUG, "rename in same directory"); - goto simple_unwind; - } - if (!oldloc->parent) { - goto simple_unwind; - } - if (inode_ctx_get(oldloc->parent,this,&value) != 0) { - goto simple_unwind; - } - - if (value != PROT_ACT_NONE) { - gf_log (this->name, GF_LOG_WARNING, - "got rename for protected %s", oldloc->path); - pcli_print_trace(this->name,frame->next); - if (value == PROT_ACT_REJECT) { - STACK_UNWIND_STRICT (rename, frame, -1, EPERM, - NULL, NULL, NULL, NULL, NULL, - xdata); - return 0; - } - } - -simple_unwind: - STACK_WIND_TAIL (frame, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, oldloc, newloc, - xdata); - return 0; -} - -int32_t -pcli_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - data_t *data; - uint64_t value; - - /* - * We can't use dict_get_str and strcmp here, 
because the value comes - * directly from the user and might not be NUL-terminated (it would - * be if we had set it ourselves. - */ - - data = dict_get(dict,PROTECT_KEY); - if (!data) { - goto simple_wind; - } - - if (dict->count > 1) { - gf_log (this->name, GF_LOG_WARNING, - "attempted to mix %s with other keys", PROTECT_KEY); - goto simple_wind; - } - - gf_log (this->name, GF_LOG_DEBUG, "got %s request", PROTECT_KEY); - if (!strncmp(data->data,"log",data->len)) { - gf_log (this->name, GF_LOG_DEBUG, - "logging removals on %s", loc->path); - value = PROT_ACT_LOG; - } - else if (!strncmp(data->data,"reject",data->len)) { - gf_log (this->name, GF_LOG_DEBUG, - "rejecting removals on %s", loc->path); - value = PROT_ACT_REJECT; - } - else { - gf_log (this->name, GF_LOG_DEBUG, - "removing protection on %s", loc->path); - value = PROT_ACT_NONE; - } - /* Right now the value doesn't matter - just the presence. */ - if (inode_ctx_set(loc->inode,this,&value) != 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set protection status for %s", loc->path); - } - STACK_UNWIND_STRICT (setxattr, frame, 0, 0, NULL); - return 0; - -simple_wind: - STACK_WIND_TAIL (frame, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, - loc, dict, flags, xdata); - return 0; -} - -int32_t -pcli_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) -{ - uint64_t value; - - if (!loc->parent || (inode_ctx_get(loc->parent,this,&value) != 0)) { - goto simple_unwind; - } - - if (value != PROT_ACT_NONE) { - gf_log (this->name, GF_LOG_WARNING, - "got unlink for protected %s", loc->path); - pcli_print_trace(this->name,frame->next); - if (value == PROT_ACT_REJECT) { - STACK_UNWIND_STRICT (unlink, frame, -1, EPERM, - NULL, NULL, NULL); - return 0; - } - } - -simple_unwind: - STACK_WIND_TAIL (frame, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); - return 0; -} - -int32_t -init (xlator_t *this) -{ - if (!this->children || this->children->next) 
{ - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - - -struct xlator_fops fops = { - .rename = pcli_rename, - .setxattr = pcli_setxattr, - .unlink = pcli_unlink, -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/protect/src/prot_dht.c b/xlators/features/protect/src/prot_dht.c deleted file mode 100644 index feec6ffd69c..00000000000 --- a/xlators/features/protect/src/prot_dht.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" - -enum gf_pdht_mem_types_ { - gf_pdht_mt_coord_t = gf_common_mt_end + 1, - gf_pdht_mt_end -}; - -typedef struct { - pthread_mutex_t lock; - uint16_t refs; - int32_t op_ret; - int32_t op_errno; - dict_t *xdata; -} pdht_coord_t; - -static char PROTECT_KEY[] = "trusted.glusterfs.protect"; - -void -pdht_unref_and_unlock (call_frame_t *frame, xlator_t *this, - pdht_coord_t *coord) -{ - gf_boolean_t should_unwind; - - should_unwind = (--(coord->refs) == 0); - pthread_mutex_unlock(&coord->lock); - - if (should_unwind) { - STACK_UNWIND_STRICT (setxattr, frame, - coord->op_ret, coord->op_errno, - coord->xdata); - if (coord->xdata) { - dict_unref(coord->xdata); - } - GF_FREE(coord); - } -} - -int32_t -pdht_recurse_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - pdht_coord_t *coord = cookie; - - pthread_mutex_lock(&coord->lock); - if (op_ret) { - coord->op_ret = op_ret; - coord->op_errno = op_errno; - } - if (xdata) { - if (coord->xdata) { - dict_unref(coord->xdata); - } - coord->xdata = dict_ref(xdata); - } - pdht_unref_and_unlock(frame,this,coord); - - return 0; -} - -void -pdht_recurse (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata, xlator_t *xl, pdht_coord_t *coord) -{ - xlator_list_t *iter; - - if (!strcmp(xl->type,"features/prot_client")) { - pthread_mutex_lock(&coord->lock); - ++(coord->refs); - pthread_mutex_unlock(&coord->lock); - STACK_WIND_COOKIE (frame, pdht_recurse_cbk, coord, xl, - xl->fops->setxattr, loc, dict, flags, xdata); - } - - else for (iter = xl->children; iter; iter = iter->next) { - pdht_recurse (frame, this, loc, dict, flags, xdata, - iter->xlator, coord); - } -} - -int32_t -pdht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) -{ - pdht_coord_t *coord; - - if 
(!dict_get(dict,PROTECT_KEY)) { - goto simple_wind; - } - - if (dict->count > 1) { - gf_log (this->name, GF_LOG_WARNING, - "attempted to mix %s with other keys", PROTECT_KEY); - goto simple_wind; - } - - coord = GF_CALLOC(1,sizeof(*coord),gf_pdht_mt_coord_t); - if (!coord) { - gf_log (this->name, GF_LOG_WARNING, "allocation failed"); - goto simple_wind; - } - - pthread_mutex_init(&coord->lock,NULL); - coord->refs = 1; - coord->op_ret = 0; - coord->xdata = NULL; - - pdht_recurse(frame,this,loc,dict,flags,xdata,this,coord); - pthread_mutex_lock(&coord->lock); - pdht_unref_and_unlock(frame,this,coord); - - return 0; - -simple_wind: - STACK_WIND_TAIL (frame, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, - loc, dict, flags, xdata); - return 0; -} - -int32_t -init (xlator_t *this) -{ - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - -struct xlator_fops fops = { - .setxattr = pdht_setxattr, -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/protect/src/prot_server.c b/xlators/features/protect/src/prot_server.c deleted file mode 100644 index beaee0889b7..00000000000 --- a/xlators/features/protect/src/prot_server.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" - -int32_t -init (xlator_t *this) -{ - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - - -struct xlator_fops fops = { -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/qemu-block/src/Makefile.am b/xlators/features/qemu-block/src/Makefile.am deleted file mode 100644 index 08a7b62a0db..00000000000 --- a/xlators/features/qemu-block/src/Makefile.am +++ /dev/null @@ -1,155 +0,0 @@ -if ENABLE_QEMU_BLOCK -xlator_LTLIBRARIES = qemu-block.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features - -qemu_block_la_LDFLAGS = -module -avoid-version -qemu_block_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GLIB_LIBS) -lz -lrt - -qemu_block_la_SOURCES_qemu = \ - $(CONTRIBDIR)/qemu/qemu-coroutine.c \ - $(CONTRIBDIR)/qemu/qemu-coroutine-lock.c \ - $(CONTRIBDIR)/qemu/qemu-coroutine-sleep.c \ - $(CONTRIBDIR)/qemu/coroutine-ucontext.c \ - $(CONTRIBDIR)/qemu/block.c \ - $(CONTRIBDIR)/qemu/nop-symbols.c - -qemu_block_la_SOURCES_qemu_util = \ - $(CONTRIBDIR)/qemu/util/aes.c \ - $(CONTRIBDIR)/qemu/util/bitmap.c \ - $(CONTRIBDIR)/qemu/util/bitops.c \ - $(CONTRIBDIR)/qemu/util/cutils.c \ - $(CONTRIBDIR)/qemu/util/error.c \ - $(CONTRIBDIR)/qemu/util/hbitmap.c \ - $(CONTRIBDIR)/qemu/util/iov.c \ - $(CONTRIBDIR)/qemu/util/module.c \ - $(CONTRIBDIR)/qemu/util/oslib-posix.c \ - $(CONTRIBDIR)/qemu/util/qemu-option.c \ - $(CONTRIBDIR)/qemu/util/qemu-error.c \ - $(CONTRIBDIR)/qemu/util/qemu-thread-posix.c \ - $(CONTRIBDIR)/qemu/util/unicode.c \ - $(CONTRIBDIR)/qemu/util/hexdump.c - 
-qemu_block_la_SOURCES_qemu_block = \ - $(CONTRIBDIR)/qemu/block/snapshot.c \ - $(CONTRIBDIR)/qemu/block/qcow2-cache.c \ - $(CONTRIBDIR)/qemu/block/qcow2-cluster.c \ - $(CONTRIBDIR)/qemu/block/qcow2-refcount.c \ - $(CONTRIBDIR)/qemu/block/qcow2-snapshot.c \ - $(CONTRIBDIR)/qemu/block/qcow2.c \ - $(CONTRIBDIR)/qemu/block/qed-check.c \ - $(CONTRIBDIR)/qemu/block/qed-cluster.c \ - $(CONTRIBDIR)/qemu/block/qed-gencb.c \ - $(CONTRIBDIR)/qemu/block/qed-l2-cache.c \ - $(CONTRIBDIR)/qemu/block/qed-table.c \ - $(CONTRIBDIR)/qemu/block/qed.c - -qemu_block_la_SOURCES_qemu_qobject = \ - $(CONTRIBDIR)/qemu/qobject/json-lexer.c \ - $(CONTRIBDIR)/qemu/qobject/json-parser.c \ - $(CONTRIBDIR)/qemu/qobject/json-streamer.c \ - $(CONTRIBDIR)/qemu/qobject/qbool.c \ - $(CONTRIBDIR)/qemu/qobject/qdict.c \ - $(CONTRIBDIR)/qemu/qobject/qerror.c \ - $(CONTRIBDIR)/qemu/qobject/qfloat.c \ - $(CONTRIBDIR)/qemu/qobject/qint.c \ - $(CONTRIBDIR)/qemu/qobject/qjson.c \ - $(CONTRIBDIR)/qemu/qobject/qlist.c \ - $(CONTRIBDIR)/qemu/qobject/qstring.c - -qemu_block_la_SOURCES = \ - $(qemu_block_la_SOURCES_qemu) \ - $(qemu_block_la_SOURCES_qemu_util) \ - $(qemu_block_la_SOURCES_qemu_block) \ - $(qemu_block_la_SOURCES_qemu_qobject) \ - bdrv-xlator.c \ - coroutine-synctask.c \ - bh-syncop.c \ - monitor-logging.c \ - clock-timer.c \ - qemu-block.c \ - qb-coroutines.c - -noinst_HEADERS_qemu = \ - $(CONTRIBDIR)/qemu/config-host.h \ - $(CONTRIBDIR)/qemu/qapi-types.h \ - $(CONTRIBDIR)/qemu/qmp-commands.h \ - $(CONTRIBDIR)/qemu/trace/generated-tracers.h \ - $(CONTRIBDIR)/qemu/include/config.h \ - $(CONTRIBDIR)/qemu/include/glib-compat.h \ - $(CONTRIBDIR)/qemu/include/qemu-common.h \ - $(CONTRIBDIR)/qemu/include/trace.h \ - $(CONTRIBDIR)/qemu/include/block/coroutine.h \ - $(CONTRIBDIR)/qemu/include/block/aio.h \ - $(CONTRIBDIR)/qemu/include/block/block.h \ - $(CONTRIBDIR)/qemu/include/block/block_int.h \ - $(CONTRIBDIR)/qemu/include/block/blockjob.h \ - $(CONTRIBDIR)/qemu/include/block/coroutine.h \ - 
$(CONTRIBDIR)/qemu/include/block/coroutine_int.h \ - $(CONTRIBDIR)/qemu/include/block/snapshot.h \ - $(CONTRIBDIR)/qemu/include/exec/cpu-common.h \ - $(CONTRIBDIR)/qemu/include/exec/hwaddr.h \ - $(CONTRIBDIR)/qemu/include/exec/poison.h \ - $(CONTRIBDIR)/qemu/include/fpu/softfloat.h \ - $(CONTRIBDIR)/qemu/include/migration/migration.h \ - $(CONTRIBDIR)/qemu/include/migration/qemu-file.h \ - $(CONTRIBDIR)/qemu/include/migration/vmstate.h \ - $(CONTRIBDIR)/qemu/include/monitor/monitor.h \ - $(CONTRIBDIR)/qemu/include/monitor/readline.h \ - $(CONTRIBDIR)/qemu/include/qapi/error.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/json-lexer.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/json-parser.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/json-streamer.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qbool.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qdict.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qerror.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qfloat.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qint.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qjson.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qlist.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qobject.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/qstring.h \ - $(CONTRIBDIR)/qemu/include/qapi/qmp/types.h \ - $(CONTRIBDIR)/qemu/include/qemu/aes.h \ - $(CONTRIBDIR)/qemu/include/qemu/atomic.h \ - $(CONTRIBDIR)/qemu/include/qemu/bitmap.h \ - $(CONTRIBDIR)/qemu/include/qemu/bitops.h \ - $(CONTRIBDIR)/qemu/include/qemu/bswap.h \ - $(CONTRIBDIR)/qemu/include/qemu/compiler.h \ - $(CONTRIBDIR)/qemu/include/qemu/error-report.h \ - $(CONTRIBDIR)/qemu/include/qemu/event_notifier.h \ - $(CONTRIBDIR)/qemu/include/qemu/hbitmap.h \ - $(CONTRIBDIR)/qemu/include/qemu/host-utils.h \ - $(CONTRIBDIR)/qemu/include/qemu/iov.h \ - $(CONTRIBDIR)/qemu/include/qemu/main-loop.h \ - $(CONTRIBDIR)/qemu/include/qemu/module.h \ - $(CONTRIBDIR)/qemu/include/qemu/notify.h \ - $(CONTRIBDIR)/qemu/include/qemu/option.h \ - $(CONTRIBDIR)/qemu/include/qemu/option_int.h \ - 
$(CONTRIBDIR)/qemu/include/qemu/osdep.h \ - $(CONTRIBDIR)/qemu/include/qemu/queue.h \ - $(CONTRIBDIR)/qemu/include/qemu/sockets.h \ - $(CONTRIBDIR)/qemu/include/qemu/thread-posix.h \ - $(CONTRIBDIR)/qemu/include/qemu/thread.h \ - $(CONTRIBDIR)/qemu/include/qemu/timer.h \ - $(CONTRIBDIR)/qemu/include/qemu/typedefs.h \ - $(CONTRIBDIR)/qemu/include/sysemu/sysemu.h \ - $(CONTRIBDIR)/qemu/include/sysemu/os-posix.h \ - $(CONTRIBDIR)/qemu/block/qcow2.h \ - $(CONTRIBDIR)/qemu/block/qed.h - -noinst_HEADERS = \ - $(noinst_HEADERS_qemu) \ - qemu-block.h \ - qemu-block-memory-types.h \ - qb-coroutines.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ - -I$(CONTRIBDIR)/qemu \ - -I$(CONTRIBDIR)/qemu/include \ - -DGLUSTER_XLATOR - -AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) $(GLIB_CFLAGS) - -CLEANFILES = - -endif diff --git a/xlators/features/qemu-block/src/bdrv-xlator.c b/xlators/features/qemu-block/src/bdrv-xlator.c deleted file mode 100644 index 106c5977535..00000000000 --- a/xlators/features/qemu-block/src/bdrv-xlator.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "inode.h" -#include "syncop.h" -#include "qemu-block.h" -#include "block/block_int.h" - -typedef struct BDRVGlusterState { - inode_t *inode; -} BDRVGlusterState; - -static QemuOptsList runtime_opts = { - .name = "gluster", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "GFID of file", - }, - { /* end of list */ } - }, -}; - -inode_t * -qb_inode_from_filename (const char *filename) -{ - const char *iptr = NULL; - inode_t *inode = NULL; - - iptr = filename + 17; - sscanf (iptr, "%p", &inode); - - return inode; -} - - -int -qb_inode_to_filename (inode_t *inode, char *filename, int size) -{ - return snprintf (filename, size, "gluster://inodep:%p", inode); -} - - -static fd_t * -fd_from_bs (BlockDriverState *bs) -{ - BDRVGlusterState *s = bs->opaque; - - return fd_anonymous (s->inode); -} - - -static int -qemu_gluster_open (BlockDriverState *bs, QDict *options, int bdrv_flags) -{ - inode_t *inode = NULL; - BDRVGlusterState *s = bs->opaque; - QemuOpts *opts = NULL; - Error *local_err = NULL; - const char *filename = NULL; - char gfid_str[128]; - int ret; - qb_conf_t *conf = THIS->private; - - opts = qemu_opts_create_nofail(&runtime_opts); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return -EINVAL; - } - - filename = qemu_opt_get(opts, "filename"); - - /* - * gfid:<gfid> format means we're opening a backing image. 
- */ - ret = sscanf(filename, "gluster://gfid:%s", gfid_str); - if (ret) { - loc_t loc = {0,}; - struct iatt buf = {0,}; - uuid_t gfid; - - uuid_parse(gfid_str, gfid); - - loc.inode = inode_find(conf->root_inode->table, gfid); - if (!loc.inode) { - loc.inode = inode_new(conf->root_inode->table); - uuid_copy(loc.inode->gfid, gfid); - } - - uuid_copy(loc.gfid, loc.inode->gfid); - ret = syncop_lookup(FIRST_CHILD(THIS), &loc, NULL, &buf, NULL, - NULL); - if (ret) { - loc_wipe(&loc); - return -errno; - } - - s->inode = inode_ref(loc.inode); - loc_wipe(&loc); - } else { - inode = qb_inode_from_filename (filename); - if (!inode) - return -EINVAL; - - s->inode = inode_ref(inode); - } - - return 0; -} - - -static int -qemu_gluster_create (const char *filename, QEMUOptionParameter *options) -{ - uint64_t total_size = 0; - inode_t *inode = NULL; - fd_t *fd = NULL; - struct iatt stat = {0, }; - int ret = 0; - - inode = qb_inode_from_filename (filename); - if (!inode) - return -EINVAL; - - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / BDRV_SECTOR_SIZE; - } - options++; - } - - fd = fd_anonymous (inode); - if (!fd) - return -ENOMEM; - - ret = syncop_fstat (FIRST_CHILD(THIS), fd, &stat); - if (ret) { - fd_unref (fd); - return -errno; - } - - if (stat.ia_size) { - /* format ONLY if the filesize is 0 bytes */ - fd_unref (fd); - return -EFBIG; - } - - if (total_size) { - ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, total_size); - if (ret) { - fd_unref (fd); - return -errno; - } - } - - fd_unref (fd); - return 0; -} - - -static int -qemu_gluster_co_readv (BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - fd_t *fd = NULL; - off_t offset = 0; - size_t size = 0; - struct iovec *iov = NULL; - int count = 0; - struct iobref *iobref = NULL; - int ret = 0; - - fd = fd_from_bs (bs); - if (!fd) - return -EIO; - - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * 
BDRV_SECTOR_SIZE; - - ret = syncop_readv (FIRST_CHILD(THIS), fd, size, offset, 0, - &iov, &count, &iobref); - if (ret < 0) { - ret = -errno; - goto out; - } - - iov_copy (qiov->iov, qiov->niov, iov, count); /* *choke!* */ - -out: - GF_FREE (iov); - if (iobref) - iobref_unref (iobref); - fd_unref (fd); - return ret; -} - - -static int -qemu_gluster_co_writev (BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - fd_t *fd = NULL; - off_t offset = 0; - size_t size = 0; - struct iobref *iobref = NULL; - struct iobuf *iobuf = NULL; - struct iovec iov = {0, }; - int ret = -ENOMEM; - - fd = fd_from_bs (bs); - if (!fd) - return -EIO; - - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - iobuf = iobuf_get2 (THIS->ctx->iobuf_pool, size); - if (!iobuf) - goto out; - - iobref = iobref_new (); - if (!iobref) { - iobuf_unref (iobuf); - goto out; - } - - iobref_add (iobref, iobuf); - - iov_unload (iobuf_ptr (iobuf), qiov->iov, qiov->niov); /* *choke!* */ - - iov.iov_base = iobuf_ptr (iobuf); - iov.iov_len = size; - - ret = syncop_writev (FIRST_CHILD(THIS), fd, &iov, 1, offset, iobref, 0); - if (ret < 0) - ret = -errno; - -out: - if (iobuf) - iobuf_unref (iobuf); - if (iobref) - iobref_unref (iobref); - fd_unref (fd); - return ret; -} - - -static int -qemu_gluster_co_flush (BlockDriverState *bs) -{ - fd_t *fd = NULL; - int ret = 0; - - fd = fd_from_bs (bs); - - ret = syncop_flush (FIRST_CHILD(THIS), fd); - - fd_unref (fd); - - return ret; -} - - -static int -qemu_gluster_co_fsync (BlockDriverState *bs) -{ - fd_t *fd = NULL; - int ret = 0; - - fd = fd_from_bs (bs); - - ret = syncop_fsync (FIRST_CHILD(THIS), fd, 0); - - fd_unref (fd); - - return ret; -} - - -static int -qemu_gluster_truncate (BlockDriverState *bs, int64_t offset) -{ - fd_t *fd = NULL; - int ret = 0; - - fd = fd_from_bs (bs); - - ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, offset); - - fd_unref (fd); - - if (ret < 0) - return ret; - - return ret; -} 
- - -static int64_t -qemu_gluster_getlength (BlockDriverState *bs) -{ - fd_t *fd = NULL; - int ret = 0; - struct iatt iatt = {0, }; - - fd = fd_from_bs (bs); - - ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); - if (ret < 0) - return -1; - - return iatt.ia_size; -} - - -static int64_t -qemu_gluster_allocated_file_size (BlockDriverState *bs) -{ - fd_t *fd = NULL; - int ret = 0; - struct iatt iatt = {0, }; - - fd = fd_from_bs (bs); - - ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); - if (ret < 0) - return -1; - - return iatt.ia_blocks * 512; -} - - -static void -qemu_gluster_close (BlockDriverState *bs) -{ - BDRVGlusterState *s = NULL; - - s = bs->opaque; - - inode_unref (s->inode); - - return; -} - - -static QEMUOptionParameter qemu_gluster_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } -}; - - -static BlockDriver bdrv_gluster = { - .format_name = "gluster", - .protocol_name = "gluster", - .instance_size = sizeof(BDRVGlusterState), - .bdrv_file_open = qemu_gluster_open, - .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, - .bdrv_getlength = qemu_gluster_getlength, - .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, - .bdrv_co_readv = qemu_gluster_co_readv, - .bdrv_co_writev = qemu_gluster_co_writev, - .bdrv_co_flush_to_os = qemu_gluster_co_flush, - .bdrv_co_flush_to_disk = qemu_gluster_co_fsync, - .bdrv_truncate = qemu_gluster_truncate, - .create_options = qemu_gluster_create_options, -}; - - -static void bdrv_gluster_init(void) -{ - bdrv_register(&bdrv_gluster); -} - - -block_init(bdrv_gluster_init); diff --git a/xlators/features/qemu-block/src/bh-syncop.c b/xlators/features/qemu-block/src/bh-syncop.c deleted file mode 100644 index e8686f6d4ba..00000000000 --- a/xlators/features/qemu-block/src/bh-syncop.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "syncop.h" -#include "qemu-block-memory-types.h" - -#include "block/aio.h" - -void -qemu_bh_schedule (QEMUBH *bh) -{ - return; -} - -void -qemu_bh_cancel (QEMUBH *bh) -{ - return; -} - -void -qemu_bh_delete (QEMUBH *bh) -{ - -} - -QEMUBH * -qemu_bh_new (QEMUBHFunc *cb, void *opaque) -{ - return NULL; -} diff --git a/xlators/features/qemu-block/src/clock-timer.c b/xlators/features/qemu-block/src/clock-timer.c deleted file mode 100644 index fcbec6ad1cd..00000000000 --- a/xlators/features/qemu-block/src/clock-timer.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "syncop.h" -#include "qemu-block-memory-types.h" - -#include "qemu/timer.h" - -QEMUClock *vm_clock; -int use_rt_clock = 0; - -QEMUTimer *qemu_new_timer (QEMUClock *clock, int scale, - QEMUTimerCB *cb, void *opaque) -{ - return NULL; -} - -int64_t qemu_get_clock_ns (QEMUClock *clock) -{ - return 0; -} - -void qemu_mod_timer (QEMUTimer *ts, int64_t expire_time) -{ - return; -} - -void qemu_free_timer (QEMUTimer *ts) -{ - -} - -void qemu_del_timer (QEMUTimer *ts) -{ - -} - -bool qemu_aio_wait() -{ - synctask_wake (synctask_get()); - synctask_yield (synctask_get()); - return 0; -} diff --git a/xlators/features/qemu-block/src/coroutine-synctask.c b/xlators/features/qemu-block/src/coroutine-synctask.c deleted file mode 100644 index e43988a953f..00000000000 --- a/xlators/features/qemu-block/src/coroutine-synctask.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "syncop.h" -#include "qemu-block-memory-types.h" - -#include "qemu-block.h" - -/* - * This code serves as the bridge from the main glusterfs context to the qemu - * coroutine context via synctask. We create a single threaded syncenv with a - * single synctask responsible for processing a queue of coroutines. 
The qemu - * code invoked from within the synctask function handlers uses the ucontext - * coroutine implementation and scheduling logic internal to qemu. This - * effectively donates a thread of execution to qemu and its internal coroutine - * management. - * - * NOTE: The existence of concurrent synctasks has proven quite racy with regard - * to qemu coroutine management, particularly related to the lifecycle - * differences with top-level synctasks and internally created coroutines and - * interactions with qemu-internal queues (and locks, in turn). We explicitly - * disallow this scenario, via the queue, until it is more well supported. - */ - -static struct { - struct list_head queue; - gf_lock_t lock; - struct synctask *task; -} qb_co; - -static void -init_qbco() -{ - INIT_LIST_HEAD(&qb_co.queue); - LOCK_INIT(&qb_co.lock); -} - -static int -synctask_nop_cbk (int ret, call_frame_t *frame, void *opaque) -{ - return 0; -} - -static int -qb_synctask_wrap (void *opaque) -{ - qb_local_t *qb_local, *tmp; - - LOCK(&qb_co.lock); - - while (!list_empty(&qb_co.queue)) { - list_for_each_entry_safe(qb_local, tmp, &qb_co.queue, list) { - list_del_init(&qb_local->list); - break; - } - - UNLOCK(&qb_co.lock); - - qb_local->synctask_fn(qb_local); - /* qb_local is now unwound and gone! 
*/ - - LOCK(&qb_co.lock); - } - - qb_co.task = NULL; - - UNLOCK(&qb_co.lock); - - return 0; -} - -int -qb_coroutine (call_frame_t *frame, synctask_fn_t fn) -{ - qb_local_t *qb_local = NULL; - qb_conf_t *qb_conf = NULL; - static int init = 0; - - qb_local = frame->local; - qb_local->synctask_fn = fn; - qb_conf = frame->this->private; - - if (!init) { - init = 1; - init_qbco(); - } - - LOCK(&qb_co.lock); - - if (!qb_co.task) - qb_co.task = synctask_create(qb_conf->env, qb_synctask_wrap, - synctask_nop_cbk, frame, NULL); - - list_add_tail(&qb_local->list, &qb_co.queue); - - UNLOCK(&qb_co.lock); - - return 0; -} diff --git a/xlators/features/qemu-block/src/monitor-logging.c b/xlators/features/qemu-block/src/monitor-logging.c deleted file mode 100644 index d37c37f0f29..00000000000 --- a/xlators/features/qemu-block/src/monitor-logging.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "qemu-block-memory-types.h" - -#include "block/block_int.h" - -Monitor *cur_mon; - -int -monitor_cur_is_qmp() -{ - /* No QMP support here */ - return 0; -} - -void -monitor_set_error (Monitor *mon, QError *qerror) -{ - /* NOP here */ - return; -} - - -void -monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) -{ - char buf[4096]; - - vsnprintf(buf, sizeof(buf), fmt, ap); - - gf_log (THIS->name, GF_LOG_ERROR, "%s", buf); -} diff --git a/xlators/features/qemu-block/src/qb-coroutines.c b/xlators/features/qemu-block/src/qb-coroutines.c deleted file mode 100644 index 7c52adb21ed..00000000000 --- a/xlators/features/qemu-block/src/qb-coroutines.c +++ /dev/null @@ -1,662 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "inode.h" -#include "call-stub.h" -#include "defaults.h" -#include "qemu-block-memory-types.h" -#include "qemu-block.h" -#include "qb-coroutines.h" - - -int -qb_format_and_resume (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - char filename[64]; - char base_filename[128]; - int use_base = 0; - qb_inode_t *qb_inode = NULL; - Error *local_err = NULL; - fd_t *fd = NULL; - dict_t *xattr = NULL; - qb_conf_t *qb_conf = NULL; - int ret = -1; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - qb_conf = frame->this->private; - - qb_inode_to_filename (inode, filename, 64); - - qb_inode = qb_inode_ctx_get (frame->this, inode); - - /* - * See if the caller specified a backing image. - */ - if (!uuid_is_null(qb_inode->backing_gfid) || qb_inode->backing_fname) { - loc_t loc = {0,}; - char gfid_str[64]; - struct iatt buf; - - if (!uuid_is_null(qb_inode->backing_gfid)) { - loc.inode = inode_find(qb_conf->root_inode->table, - qb_inode->backing_gfid); - if (!loc.inode) { - loc.inode = inode_new(qb_conf->root_inode->table); - uuid_copy(loc.inode->gfid, - qb_inode->backing_gfid); - } - uuid_copy(loc.gfid, loc.inode->gfid); - } else if (qb_inode->backing_fname) { - loc.inode = inode_new(qb_conf->root_inode->table); - loc.name = qb_inode->backing_fname; - loc.parent = inode_parent(inode, NULL, NULL); - loc_path(&loc, loc.name); - } - - /* - * Lookup the backing image. Verify existence and/or get the - * gfid if we don't already have it. 
- */ - ret = syncop_lookup(FIRST_CHILD(frame->this), &loc, NULL, &buf, - NULL, NULL); - GF_FREE(qb_inode->backing_fname); - if (ret) { - loc_wipe(&loc); - ret = errno; - goto err; - } - - uuid_copy(qb_inode->backing_gfid, buf.ia_gfid); - loc_wipe(&loc); - - /* - * We pass the filename of the backing image into the qemu block - * subsystem as the associated gfid. This is embedded into the - * clone image and passed along to the gluster bdrv backend when - * the block subsystem needs to operate on the backing image on - * behalf of the clone. - */ - uuid_unparse(qb_inode->backing_gfid, gfid_str); - snprintf(base_filename, sizeof(base_filename), - "gluster://gfid:%s", gfid_str); - use_base = 1; - } - - bdrv_img_create (filename, qb_inode->fmt, - use_base ? base_filename : NULL, 0, 0, qb_inode->size, - 0, &local_err, true); - - if (error_is_set (&local_err)) { - gf_log (frame->this->name, GF_LOG_ERROR, "%s", - error_get_pretty (local_err)); - error_free (local_err); - QB_STUB_UNWIND (stub, -1, EIO); - return 0; - } - - fd = fd_anonymous (inode); - if (!fd) { - gf_log (frame->this->name, GF_LOG_ERROR, - "could not create anonymous fd for %s", - uuid_utoa (inode->gfid)); - QB_STUB_UNWIND (stub, -1, ENOMEM); - return 0; - } - - xattr = dict_new (); - if (!xattr) { - gf_log (frame->this->name, GF_LOG_ERROR, - "could not allocate xattr dict for %s", - uuid_utoa (inode->gfid)); - QB_STUB_UNWIND (stub, -1, ENOMEM); - fd_unref (fd); - return 0; - } - - ret = dict_set_str (xattr, qb_conf->qb_xattr_key, local->fmt); - if (ret) { - gf_log (frame->this->name, GF_LOG_ERROR, - "could not dict_set for %s", - uuid_utoa (inode->gfid)); - QB_STUB_UNWIND (stub, -1, ENOMEM); - fd_unref (fd); - dict_unref (xattr); - return 0; - } - - ret = syncop_fsetxattr (FIRST_CHILD(THIS), fd, xattr, 0); - if (ret) { - ret = errno; - gf_log (frame->this->name, GF_LOG_ERROR, - "failed to setxattr for %s", - uuid_utoa (inode->gfid)); - QB_STUB_UNWIND (stub, -1, ret); - fd_unref (fd); - dict_unref (xattr); 
- return 0; - } - - fd_unref (fd); - dict_unref (xattr); - - QB_STUB_UNWIND (stub, 0, 0); - - return 0; - -err: - QB_STUB_UNWIND(stub, -1, ret); - return 0; -} - - -static BlockDriverState * -qb_bs_create (inode_t *inode, const char *fmt) -{ - char filename[64]; - BlockDriverState *bs = NULL; - BlockDriver *drv = NULL; - int op_errno = 0; - int ret = 0; - - bs = bdrv_new (uuid_utoa (inode->gfid)); - if (!bs) { - op_errno = ENOMEM; - gf_log (THIS->name, GF_LOG_ERROR, - "could not allocate @bdrv for gfid:%s", - uuid_utoa (inode->gfid)); - goto err; - } - - drv = bdrv_find_format (fmt); - if (!drv) { - op_errno = EINVAL; - gf_log (THIS->name, GF_LOG_ERROR, - "Unknown file format: %s for gfid:%s", - fmt, uuid_utoa (inode->gfid)); - goto err; - } - - qb_inode_to_filename (inode, filename, 64); - - ret = bdrv_open (bs, filename, NULL, BDRV_O_RDWR, drv); - if (ret < 0) { - op_errno = -ret; - gf_log (THIS->name, GF_LOG_ERROR, - "Unable to bdrv_open() gfid:%s (%s)", - uuid_utoa (inode->gfid), strerror (op_errno)); - goto err; - } - - return bs; -err: - errno = op_errno; - return NULL; -} - - -int -qb_co_open (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - qb_inode->refcnt++; - - QB_STUB_RESUME (stub); - - return 0; -} - - -int -qb_co_writev (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - QEMUIOVector qiov = {0, }; - int ret = 0; - - local = opaque; 
- frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - qemu_iovec_init_external (&qiov, stub->args.vector, stub->args.count); - - ret = bdrv_pwritev (qb_inode->bs, stub->args.offset, &qiov); - - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} - - -int -qb_co_readv (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - struct iovec iov = {0, }; - int ret = 0; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - if (stub->args.offset >= qb_inode->size) { - QB_STUB_UNWIND (stub, 0, 0); - return 0; - } - - iobuf = iobuf_get2 (frame->this->ctx->iobuf_pool, stub->args.size); - if (!iobuf) { - QB_STUB_UNWIND (stub, -1, ENOMEM); - return 0; - } - - iobref = iobref_new (); - if (!iobref) { - QB_STUB_UNWIND (stub, -1, ENOMEM); - iobuf_unref (iobuf); - return 0; - } - - if (iobref_add (iobref, iobuf) < 0) { - iobuf_unref (iobuf); - iobref_unref (iobref); - QB_STUB_UNWIND (stub, -1, ENOMEM); - return 0; - } - - ret = bdrv_pread (qb_inode->bs, stub->args.offset, iobuf_ptr (iobuf), - stub->args.size); - - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - 
iobref_unref (iobref); - return 0; - } - - iov.iov_base = iobuf_ptr (iobuf); - iov.iov_len = ret; - - stub->args_cbk.vector = iov_dup (&iov, 1); - stub->args_cbk.count = 1; - stub->args_cbk.iobref = iobref; - - QB_STUB_UNWIND (stub, ret, 0); - - return 0; -} - - -int -qb_co_fsync (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - int ret = 0; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - ret = bdrv_flush (qb_inode->bs); - - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} - - -static void -qb_update_size_xattr (xlator_t *this, fd_t *fd, const char *fmt, off_t offset) -{ - char val[QB_XATTR_VAL_MAX]; - qb_conf_t *qb_conf = NULL; - dict_t *xattr = NULL; - - qb_conf = this->private; - - snprintf (val, QB_XATTR_VAL_MAX, "%s:%llu", - fmt, (long long unsigned) offset); - - xattr = dict_new (); - if (!xattr) - return; - - if (dict_set_str (xattr, qb_conf->qb_xattr_key, val) != 0) { - dict_unref (xattr); - return; - } - - syncop_fsetxattr (FIRST_CHILD(this), fd, xattr, 0); - dict_unref (xattr); -} - - -int -qb_co_truncate (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - int ret = 0; - off_t offset = 0; - xlator_t *this = NULL; - - this = THIS; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need 
locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.prestat); - stub->args_cbk.prestat.ia_size = qb_inode->size; - - ret = bdrv_truncate (qb_inode->bs, stub->args.offset); - if (ret < 0) - goto out; - - offset = bdrv_getlength (qb_inode->bs); - - qb_inode->size = offset; - - syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.poststat); - stub->args_cbk.poststat.ia_size = qb_inode->size; - - qb_update_size_xattr (this, local->fd, qb_inode->fmt, qb_inode->size); - -out: - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} - - -int -qb_co_close (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - BlockDriverState *bs = NULL; - - local = opaque; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (THIS, inode); - - if (!--qb_inode->refcnt) { - bs = qb_inode->bs; - qb_inode->bs = NULL; - bdrv_delete (bs); - } - - frame = local->frame; - frame->local = NULL; - qb_local_free (THIS, local); - STACK_DESTROY (frame->root); - - return 0; -} - - -int -qb_snapshot_create (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - QEMUSnapshotInfo sn; - struct timeval tv = {0, }; - int ret = 0; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - 
} - - memset (&sn, 0, sizeof (sn)); - pstrcpy (sn.name, sizeof(sn.name), local->name); - gettimeofday (&tv, NULL); - sn.date_sec = tv.tv_sec; - sn.date_nsec = tv.tv_usec * 1000; - - ret = bdrv_snapshot_create (qb_inode->bs, &sn); - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} - - -int -qb_snapshot_delete (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - int ret = 0; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - ret = bdrv_snapshot_delete (qb_inode->bs, local->name); - - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} - - -int -qb_snapshot_goto (void *opaque) -{ - qb_local_t *local = NULL; - call_frame_t *frame = NULL; - call_stub_t *stub = NULL; - inode_t *inode = NULL; - qb_inode_t *qb_inode = NULL; - int ret = 0; - - local = opaque; - frame = local->frame; - stub = local->stub; - inode = local->inode; - - qb_inode = qb_inode_ctx_get (frame->this, inode); - if (!qb_inode->bs) { - /* FIXME: we need locks around this when - enabling multithreaded syncop/coroutine - for qemu-block - */ - - qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); - if (!qb_inode->bs) { - QB_STUB_UNWIND (stub, -1, errno); - return 0; - } - } - - ret = bdrv_snapshot_goto (qb_inode->bs, local->name); - - if (ret < 0) { - QB_STUB_UNWIND (stub, -1, -ret); - } else { - QB_STUB_UNWIND (stub, ret, 0); - } - - return 0; -} diff --git a/xlators/features/qemu-block/src/qb-coroutines.h 
b/xlators/features/qemu-block/src/qb-coroutines.h deleted file mode 100644 index 583319f3b06..00000000000 --- a/xlators/features/qemu-block/src/qb-coroutines.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __QB_COROUTINES_H -#define __QB_COROUTINES_H - -#include "syncop.h" -#include "call-stub.h" -#include "block/block_int.h" -#include "monitor/monitor.h" - -int qb_format_and_resume (void *opaque); -int qb_snapshot_create (void *opaque); -int qb_snapshot_delete (void *opaque); -int qb_snapshot_goto (void *opaque); -int qb_co_open (void *opaque); -int qb_co_close (void *opaque); -int qb_co_writev (void *opaque); -int qb_co_readv (void *opaque); -int qb_co_fsync (void *opaque); -int qb_co_truncate (void *opaque); - -#endif /* __QB_COROUTINES_H */ diff --git a/xlators/features/qemu-block/src/qemu-block-memory-types.h b/xlators/features/qemu-block/src/qemu-block-memory-types.h deleted file mode 100644 index 267b3893fed..00000000000 --- a/xlators/features/qemu-block/src/qemu-block-memory-types.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#ifndef __QB_MEM_TYPES_H__ -#define __QB_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_qb_mem_types_ { - gf_qb_mt_qb_conf_t = gf_common_mt_end + 1, - gf_qb_mt_qb_inode_t, - gf_qb_mt_qb_local_t, - gf_qb_mt_coroutinesynctask_t, - gf_qb_mt_end -}; -#endif - diff --git a/xlators/features/qemu-block/src/qemu-block.c b/xlators/features/qemu-block/src/qemu-block.c deleted file mode 100644 index 48bbf314048..00000000000 --- a/xlators/features/qemu-block/src/qemu-block.c +++ /dev/null @@ -1,1140 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "inode.h" -#include "call-stub.h" -#include "defaults.h" -#include "qemu-block-memory-types.h" -#include "qemu-block.h" -#include "qb-coroutines.h" - - -qb_inode_t * -__qb_inode_ctx_get (xlator_t *this, inode_t *inode) -{ - uint64_t value = 0; - qb_inode_t *qb_inode = NULL; - - __inode_ctx_get (inode, this, &value); - qb_inode = (qb_inode_t *)(unsigned long) value; - - return qb_inode; -} - - -qb_inode_t * -qb_inode_ctx_get (xlator_t *this, inode_t *inode) -{ - qb_inode_t *qb_inode = NULL; - - LOCK (&inode->lock); - { - qb_inode = __qb_inode_ctx_get (this, inode); - } - UNLOCK (&inode->lock); - - return qb_inode; -} - - -qb_inode_t * -qb_inode_ctx_del (xlator_t *this, inode_t *inode) -{ - uint64_t value = 0; - qb_inode_t *qb_inode = NULL; - - inode_ctx_del (inode, this, &value); - qb_inode = (qb_inode_t *)(unsigned long) value; - - return qb_inode; -} - - -int -qb_inode_cleanup (xlator_t *this, inode_t *inode, int warn) -{ - qb_inode_t 
*qb_inode = NULL; - - qb_inode = qb_inode_ctx_del (this, inode); - - if (!qb_inode) - return 0; - - if (warn) - gf_log (this->name, GF_LOG_WARNING, - "inode %s no longer block formatted", - uuid_utoa (inode->gfid)); - - /* free (qb_inode->bs); */ - - GF_FREE (qb_inode); - - return 0; -} - - -int -qb_iatt_fixup (xlator_t *this, inode_t *inode, struct iatt *iatt) -{ - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, inode); - if (!qb_inode) - return 0; - - iatt->ia_size = qb_inode->size; - - return 0; -} - - -int -qb_format_extract (xlator_t *this, char *format, inode_t *inode) -{ - char *s, *save; - uint64_t size = 0; - char fmt[QB_XATTR_VAL_MAX+1] = {0, }; - qb_inode_t *qb_inode = NULL; - char *formatstr = NULL; - uuid_t gfid = {0,}; - char gfid_str[64] = {0,}; - int ret; - - strncpy(fmt, format, QB_XATTR_VAL_MAX); - - s = strtok_r(fmt, ":", &save); - if (!s) - goto invalid; - formatstr = gf_strdup(s); - - s = strtok_r(NULL, ":", &save); - if (!s) - goto invalid; - if (gf_string2bytesize (s, &size)) - goto invalid; - if (!size) - goto invalid; - - s = strtok_r(NULL, "\0", &save); - if (s && !strncmp(s, "<gfid:", strlen("<gfid:"))) { - /* - * Check for valid gfid backing image specifier. - */ - if (strlen(s) + 1 > sizeof(gfid_str)) - goto invalid; - ret = sscanf(s, "<gfid:%[^>]s", gfid_str); - if (ret == 1) { - ret = uuid_parse(gfid_str, gfid); - if (ret < 0) - goto invalid; - } - } - - qb_inode = qb_inode_ctx_get (this, inode); - if (!qb_inode) - qb_inode = GF_CALLOC (1, sizeof (*qb_inode), - gf_qb_mt_qb_inode_t); - if (!qb_inode) { - GF_FREE(formatstr); - return ENOMEM; - } - - strncpy(qb_inode->fmt, formatstr, QB_XATTR_VAL_MAX); - qb_inode->size = size; - - /* - * If a backing gfid was not specified, interpret any remaining bytes - * associated with a backing image as a filename local to the parent - * directory. The format processing will validate further. 
- */ - if (!uuid_is_null(gfid)) - uuid_copy(qb_inode->backing_gfid, gfid); - else if (s) - qb_inode->backing_fname = gf_strdup(s); - - inode_ctx_set (inode, this, (void *)&qb_inode); - - GF_FREE(formatstr); - - return 0; - -invalid: - GF_FREE(formatstr); - - gf_log (this->name, GF_LOG_WARNING, - "invalid format '%s' in inode %s", format, - uuid_utoa (inode->gfid)); - return EINVAL; -} - - -void -qb_local_free (xlator_t *this, qb_local_t *local) -{ - if (local->inode) - inode_unref (local->inode); - if (local->fd) - fd_unref (local->fd); - GF_FREE (local); -} - - -int -qb_local_init (call_frame_t *frame) -{ - qb_local_t *qb_local = NULL; - - qb_local = GF_CALLOC (1, sizeof (*qb_local), gf_qb_mt_qb_local_t); - if (!qb_local) - return -1; - INIT_LIST_HEAD(&qb_local->list); - - qb_local->frame = frame; - frame->local = qb_local; - - return 0; -} - - -int -qb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct iatt *buf, - dict_t *xdata, struct iatt *postparent) -{ - char *format = NULL; - qb_conf_t *conf = NULL; - - conf = this->private; - - if (op_ret == -1) - goto out; - - /* - * Cache the root inode for dealing with backing images. The format - * coroutine and the gluster qemu backend driver both use the root inode - * table to verify and/or redirect I/O to the backing image via - * anonymous fd's. 
- */ - if (!conf->root_inode && __is_root_gfid(inode->gfid)) - conf->root_inode = inode_ref(inode); - - if (!xdata) - goto out; - - if (dict_get_str (xdata, conf->qb_xattr_key, &format)) - goto out; - - if (!format) { - qb_inode_cleanup (this, inode, 1); - goto out; - } - - op_errno = qb_format_extract (this, format, inode); - if (op_errno) - op_ret = -1; - - qb_iatt_fixup (this, inode, buf); -out: - QB_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, - xdata, postparent); - return 0; -} - - -int -qb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ - qb_conf_t *conf = NULL; - - conf = this->private; - - xdata = xdata ? dict_ref (xdata) : dict_new (); - - if (!xdata) - goto enomem; - - if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) - goto enomem; - - STACK_WIND (frame, qb_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xdata); - dict_unref (xdata); - return 0; -enomem: - QB_STACK_UNWIND (lookup, frame, -1, ENOMEM, 0, 0, 0, 0); - if (xdata) - dict_unref (xdata); - return 0; -} - - -int -qb_setxattr_format (call_frame_t *frame, xlator_t *this, call_stub_t *stub, - dict_t *xattr, inode_t *inode) -{ - char *format = NULL; - int op_errno = 0; - qb_local_t *qb_local = NULL; - data_t *data = NULL; - qb_inode_t *qb_inode; - - if (!(data = dict_get (xattr, "trusted.glusterfs.block-format"))) { - QB_STUB_RESUME (stub); - return 0; - } - - format = alloca (data->len + 1); - memcpy (format, data->data, data->len); - format[data->len] = 0; - - op_errno = qb_format_extract (this, format, inode); - if (op_errno) { - QB_STUB_UNWIND (stub, -1, op_errno); - return 0; - } - qb_inode = qb_inode_ctx_get(this, inode); - - qb_local = frame->local; - - qb_local->stub = stub; - qb_local->inode = inode_ref (inode); - - snprintf(qb_local->fmt, QB_XATTR_VAL_MAX, "%s:%lu", qb_inode->fmt, - qb_inode->size); - - qb_coroutine (frame, qb_format_and_resume); - - return 0; -} - - -int -qb_setxattr_snapshot_create (call_frame_t *frame, 
xlator_t *this, - call_stub_t *stub, dict_t *xattr, inode_t *inode) -{ - qb_local_t *qb_local = NULL; - char *name = NULL; - data_t *data = NULL; - - if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { - QB_STUB_RESUME (stub); - return 0; - } - - name = alloca (data->len + 1); - memcpy (name, data->data, data->len); - name[data->len] = 0; - - qb_local = frame->local; - - qb_local->stub = stub; - qb_local->inode = inode_ref (inode); - strncpy (qb_local->name, name, 128); - - qb_coroutine (frame, qb_snapshot_create); - - return 0; -} - - -int -qb_setxattr_snapshot_delete (call_frame_t *frame, xlator_t *this, - call_stub_t *stub, dict_t *xattr, inode_t *inode) -{ - qb_local_t *qb_local = NULL; - char *name = NULL; - data_t *data = NULL; - - if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { - QB_STUB_RESUME (stub); - return 0; - } - - name = alloca (data->len + 1); - memcpy (name, data->data, data->len); - name[data->len] = 0; - - qb_local = frame->local; - - qb_local->stub = stub; - qb_local->inode = inode_ref (inode); - strncpy (qb_local->name, name, 128); - - qb_coroutine (frame, qb_snapshot_delete); - - return 0; -} - -int -qb_setxattr_snapshot_goto (call_frame_t *frame, xlator_t *this, - call_stub_t *stub, dict_t *xattr, inode_t *inode) -{ - qb_local_t *qb_local = NULL; - char *name = NULL; - data_t *data = NULL; - - if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { - QB_STUB_RESUME (stub); - return 0; - } - - name = alloca (data->len + 1); - memcpy (name, data->data, data->len); - name[data->len] = 0; - - qb_local = frame->local; - - qb_local->stub = stub; - qb_local->inode = inode_ref (inode); - strncpy (qb_local->name, name, 128); - - qb_coroutine (frame, qb_snapshot_goto); - - return 0; -} - - -int -qb_setxattr_common (call_frame_t *frame, xlator_t *this, call_stub_t *stub, - dict_t *xattr, inode_t *inode) -{ - data_t *data = NULL; - - if ((data = dict_get (xattr, 
"trusted.glusterfs.block-format"))) { - qb_setxattr_format (frame, this, stub, xattr, inode); - return 0; - } - - if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { - qb_setxattr_snapshot_create (frame, this, stub, xattr, inode); - return 0; - } - - if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { - qb_setxattr_snapshot_delete (frame, this, stub, xattr, inode); - return 0; - } - - if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { - qb_setxattr_snapshot_goto (frame, this, stub, xattr, inode); - return 0; - } - - QB_STUB_RESUME (stub); - - return 0; -} - - -int -qb_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, - int flags, dict_t *xdata) -{ - call_stub_t *stub = NULL; - - if (qb_local_init (frame) != 0) - goto enomem; - - stub = fop_setxattr_stub (frame, default_setxattr_resume, loc, xattr, - flags, xdata); - if (!stub) - goto enomem; - - qb_setxattr_common (frame, this, stub, xattr, loc->inode); - - return 0; -enomem: - QB_STACK_UNWIND (setxattr, frame, -1, ENOMEM, 0); - return 0; -} - - -int -qb_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, - int flags, dict_t *xdata) -{ - call_stub_t *stub = NULL; - - if (qb_local_init (frame) != 0) - goto enomem; - - stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr, - flags, xdata); - if (!stub) - goto enomem; - - qb_setxattr_common (frame, this, stub, xattr, fd->inode); - - return 0; -enomem: - QB_STACK_UNWIND (fsetxattr, frame, -1, ENOMEM, 0); - return 0; -} - - -int -qb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) -{ - call_stub_t *stub = NULL; - qb_local_t *qb_local = NULL; - - qb_local = frame->local; - - if (op_ret < 0) - goto unwind; - - if (!qb_inode_ctx_get (this, qb_local->inode)) - goto unwind; - - stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); - if (!stub) { - op_ret = -1; - 
op_errno = ENOMEM; - goto unwind; - } - - qb_local->stub = stub; - - qb_coroutine (frame, qb_co_open); - - return 0; -unwind: - QB_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata); - return 0; -} - - -int -qb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - fd_t *fd, dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, loc->inode); - if (!qb_inode) { - STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, - xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (loc->inode); - qb_local->fd = fd_ref (fd); - - STACK_WIND (frame, qb_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; -enomem: - QB_STACK_UNWIND (open, frame, -1, ENOMEM, 0, 0); - return 0; -} - - -int -qb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int count, off_t offset, uint32_t flags, struct iobref *iobref, - dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, fd->inode); - if (!qb_inode) { - STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, - offset, flags, iobref, xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (fd->inode); - qb_local->fd = fd_ref (fd); - - qb_local->stub = fop_writev_stub (frame, NULL, fd, vector, count, - offset, flags, iobref, xdata); - if (!qb_local->stub) - goto enomem; - - qb_coroutine (frame, qb_co_writev); - - return 0; -enomem: - QB_STACK_UNWIND (writev, frame, -1, ENOMEM, 0, 0, 0); - return 0; -} - - -int -qb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - 
qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, fd->inode); - if (!qb_inode) { - STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset, - flags, xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (fd->inode); - qb_local->fd = fd_ref (fd); - - qb_local->stub = fop_readv_stub (frame, NULL, fd, size, offset, - flags, xdata); - if (!qb_local->stub) - goto enomem; - - qb_coroutine (frame, qb_co_readv); - - return 0; -enomem: - QB_STACK_UNWIND (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); - return 0; -} - - -int -qb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int dsync, - dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, fd->inode); - if (!qb_inode) { - STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, dsync, xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (fd->inode); - qb_local->fd = fd_ref (fd); - - qb_local->stub = fop_fsync_stub (frame, NULL, fd, dsync, xdata); - - if (!qb_local->stub) - goto enomem; - - qb_coroutine (frame, qb_co_fsync); - - return 0; -enomem: - QB_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); - return 0; -} - - -int -qb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, fd->inode); - if (!qb_inode) { - STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd, xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (fd->inode); - qb_local->fd = fd_ref (fd); - - qb_local->stub = fop_flush_stub (frame, NULL, fd, xdata); - - if (!qb_local->stub) - goto 
enomem; - - qb_coroutine (frame, qb_co_fsync); - - return 0; -enomem: - QB_STACK_UNWIND (flush, frame, -1, ENOMEM, 0); - return 0; -} - -static int32_t -qb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - qb_conf_t *conf = this->private; - gf_dirent_t *entry; - char *format; - - list_for_each_entry(entry, &entries->list, list) { - if (!entry->inode || !entry->dict) - continue; - - format = NULL; - if (dict_get_str(entry->dict, conf->qb_xattr_key, &format)) - continue; - - if (!format) { - qb_inode_cleanup(this, entry->inode, 1); - continue; - } - - if (qb_format_extract(this, format, entry->inode)) - continue; - - qb_iatt_fixup(this, entry->inode, &entry->d_stat); - } - - STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; -} - -static int32_t -qb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off, dict_t *xdata) -{ - qb_conf_t *conf = this->private; - - xdata = xdata ? 
dict_ref(xdata) : dict_new(); - if (!xdata) - goto enomem; - - if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) - goto enomem; - - STACK_WIND(frame, qb_readdirp_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); - - dict_unref(xdata); - return 0; - -enomem: - QB_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); - if (xdata) - dict_unref(xdata); - return 0; -} - -int -qb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, loc->inode); - if (!qb_inode) { - STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset, - xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (loc->inode); - qb_local->fd = fd_anonymous (loc->inode); - - qb_local->stub = fop_truncate_stub (frame, NULL, loc, offset, xdata); - - if (!qb_local->stub) - goto enomem; - - qb_coroutine (frame, qb_co_truncate); - - return 0; -enomem: - QB_STACK_UNWIND (truncate, frame, -1, ENOMEM, 0, 0, 0); - return 0; -} - - -int -qb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) -{ - qb_local_t *qb_local = NULL; - qb_inode_t *qb_inode = NULL; - - qb_inode = qb_inode_ctx_get (this, fd->inode); - if (!qb_inode) { - STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset, - xdata); - return 0; - } - - if (qb_local_init (frame) != 0) - goto enomem; - - qb_local = frame->local; - - qb_local->inode = inode_ref (fd->inode); - qb_local->fd = fd_ref (fd); - - qb_local->stub = fop_ftruncate_stub (frame, NULL, fd, offset, xdata); - - if (!qb_local->stub) - goto enomem; - - qb_coroutine (frame, qb_co_truncate); - - return 0; -enomem: - QB_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, 0, 0, 0); - return 0; -} - - -int 
-qb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) -{ - inode_t *inode = NULL; - - inode = frame->local; - frame->local = NULL; - - if (inode) { - qb_iatt_fixup (this, inode, iatt); - inode_unref (inode); - } - - QB_STACK_UNWIND (stat, frame, op_ret, op_errno, iatt, xdata); - - return 0; -} - -int -qb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ - if (qb_inode_ctx_get (this, loc->inode)) - frame->local = inode_ref (loc->inode); - - STACK_WIND (frame, qb_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; -} - - -int -qb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) -{ - inode_t *inode = NULL; - - inode = frame->local; - frame->local = NULL; - - if (inode) { - qb_iatt_fixup (this, inode, iatt); - inode_unref (inode); - } - - QB_STACK_UNWIND (fstat, frame, op_ret, op_errno, iatt, xdata); - - return 0; -} - - -int -qb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - if (qb_inode_ctx_get (this, fd->inode)) - frame->local = inode_ref (fd->inode); - - STACK_WIND (frame, qb_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd, xdata); - return 0; -} - - -int -qb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, struct iatt *post, - dict_t *xdata) -{ - inode_t *inode = NULL; - - inode = frame->local; - frame->local = NULL; - - if (inode) { - qb_iatt_fixup (this, inode, pre); - qb_iatt_fixup (this, inode, post); - inode_unref (inode); - } - - QB_STACK_UNWIND (setattr, frame, op_ret, op_errno, pre, post, xdata); - - return 0; -} - - -int -qb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, - int valid, dict_t *xdata) -{ - if (qb_inode_ctx_get (this, loc->inode)) - frame->local = inode_ref (loc->inode); - - STACK_WIND (frame, 
qb_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, buf, valid, xdata); - return 0; -} - - -int -qb_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, struct iatt *post, - dict_t *xdata) -{ - inode_t *inode = NULL; - - inode = frame->local; - frame->local = NULL; - - if (inode) { - qb_iatt_fixup (this, inode, pre); - qb_iatt_fixup (this, inode, post); - inode_unref (inode); - } - - QB_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, pre, post, xdata); - - return 0; -} - - -int -qb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, - int valid, dict_t *xdata) -{ - if (qb_inode_ctx_get (this, fd->inode)) - frame->local = inode_ref (fd->inode); - - STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid, xdata); - return 0; -} - - -int -qb_forget (xlator_t *this, inode_t *inode) -{ - return qb_inode_cleanup (this, inode, 0); -} - - -int -qb_release (xlator_t *this, fd_t *fd) -{ - call_frame_t *frame = NULL; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - gf_log (this->name, GF_LOG_ERROR, - "Could not allocate frame. " - "Leaking QEMU BlockDriverState"); - return -1; - } - - if (qb_local_init (frame) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not allocate local. " - "Leaking QEMU BlockDriverState"); - STACK_DESTROY (frame->root); - return -1; - } - - if (qb_coroutine (frame, qb_co_close) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not allocate coroutine. 
" - "Leaking QEMU BlockDriverState"); - qb_local_free (this, frame->local); - frame->local = NULL; - STACK_DESTROY (frame->root); - } - - return 0; -} - -int -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - ret = xlator_mem_acct_init (this, gf_qb_mt_end + 1); - - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init " - "failed"); - return ret; -} - - -int -reconfigure (xlator_t *this, dict_t *options) -{ - return 0; -} - - -int -init (xlator_t *this) -{ - qb_conf_t *conf = NULL; - int32_t ret = -1; - static int bdrv_inited = 0; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: qemu-block (%s) not configured with exactly " - "one child", this->name); - goto out; - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_qb_mt_qb_conf_t); - if (!conf) - goto out; - - /* configure 'option window-size <size>' */ - GF_OPTION_INIT ("default-password", conf->default_password, str, out); - - /* qemu coroutines use "co_mutex" for synchronizing among themselves. - However "co_mutex" itself is not threadsafe if the coroutine framework - is multithreaded (which usually is not). However synctasks are - fundamentally multithreaded, so for now create a syncenv which has - scaling limits set to max 1 thread so that the qemu coroutines can - execute "safely". - - Future work: provide an implementation of "co_mutex" which is - threadsafe and use the global multithreaded ctx->env syncenv. 
- */ - conf->env = syncenv_new (0, 1, 1); - - this->private = conf; - - ret = 0; - - snprintf (conf->qb_xattr_key, QB_XATTR_KEY_MAX, QB_XATTR_KEY_FMT, - this->name); - - cur_mon = (void *) 1; - - if (!bdrv_inited) { - bdrv_init (); - bdrv_inited = 1; - } - -out: - if (ret) - GF_FREE (conf); - - return ret; -} - - -void -fini (xlator_t *this) -{ - qb_conf_t *conf = NULL; - - conf = this->private; - - this->private = NULL; - - if (conf->root_inode) - inode_unref(conf->root_inode); - GF_FREE (conf); - - return; -} - - -struct xlator_fops fops = { - .lookup = qb_lookup, - .fsetxattr = qb_fsetxattr, - .setxattr = qb_setxattr, - .open = qb_open, - .writev = qb_writev, - .readv = qb_readv, - .fsync = qb_fsync, - .truncate = qb_truncate, - .ftruncate = qb_ftruncate, - .stat = qb_stat, - .fstat = qb_fstat, - .setattr = qb_setattr, - .fsetattr = qb_fsetattr, - .flush = qb_flush, -/* - .getxattr = qb_getxattr, - .fgetxattr = qb_fgetxattr -*/ - .readdirp = qb_readdirp, -}; - - -struct xlator_cbks cbks = { - .forget = qb_forget, - .release = qb_release, -}; - - -struct xlator_dumpops dumpops = { -}; - - -struct volume_options options[] = { - { .key = {"default-password"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "", - .description = "Default password for the AES encrypted block images." - }, - { .key = {NULL} }, -}; diff --git a/xlators/features/qemu-block/src/qemu-block.h b/xlators/features/qemu-block/src/qemu-block.h deleted file mode 100644 index c95f2799ac6..00000000000 --- a/xlators/features/qemu-block/src/qemu-block.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#ifndef __QEMU_BLOCK_H -#define __QEMU_BLOCK_H - -#include "syncop.h" -#include "call-stub.h" -#include "block/block_int.h" -#include "monitor/monitor.h" - -/* QB_XATTR_KEY_FMT is the on-disk xattr stored in the inode which - indicates that the file must be "interpreted" by the block format - logic. The value of the key is of the pattern: - - "format:virtual_size" - - e.g - - "qcow2:20GB" or "qed:100GB" - - The format and virtual size are colon separated. The format is - a case sensitive string which qemu recognizes. virtual_size is - specified as a size which glusterfs recognizes as size (i.e., - value accepted by gf_string2bytesize()) -*/ -#define QB_XATTR_KEY_FMT "trusted.glusterfs.%s.format" - -#define QB_XATTR_KEY_MAX 64 - -#define QB_XATTR_VAL_MAX 64 - - -typedef struct qb_inode { - char fmt[QB_XATTR_VAL_MAX]; /* this is only the format, not "format:size" */ - size_t size; /* virtual size in bytes */ - BlockDriverState *bs; - int refcnt; - uuid_t backing_gfid; - char *backing_fname; -} qb_inode_t; - - -typedef struct qb_conf { - Monitor *mon; - struct syncenv *env; - char qb_xattr_key[QB_XATTR_KEY_MAX]; - char *default_password; - inode_t *root_inode; -} qb_conf_t; - - -typedef struct qb_local { - call_frame_t *frame; /* backpointer */ - call_stub_t *stub; - inode_t *inode; - fd_t *fd; - char fmt[QB_XATTR_VAL_MAX+1]; - char name[256]; - synctask_fn_t synctask_fn; - struct list_head list; -} qb_local_t; - -void qb_local_free (xlator_t *this, qb_local_t *local); -int qb_coroutine (call_frame_t *frame, synctask_fn_t fn); -inode_t *qb_inode_from_filename (const char *filename); -int qb_inode_to_filename (inode_t *inode, char *filename, int size); -int qb_format_extract (xlator_t *this, char *format, inode_t *inode); - -qb_inode_t *qb_inode_ctx_get (xlator_t *this, inode_t *inode); - -#define QB_STACK_UNWIND(typ, frame, args ...) 
do { \ - qb_local_t *__local = frame->local; \ - xlator_t *__this = frame->this; \ - \ - frame->local = NULL; \ - STACK_UNWIND_STRICT (typ, frame, args); \ - if (__local) \ - qb_local_free (__this, __local); \ - } while (0) - -#define QB_STUB_UNWIND(stub, op_ret, op_errno) do { \ - qb_local_t *__local = stub->frame->local; \ - xlator_t *__this = stub->frame->this; \ - \ - stub->frame->local = NULL; \ - call_unwind_error (stub, op_ret, op_errno); \ - if (__local) \ - qb_local_free (__this, __local); \ - } while (0) - -#define QB_STUB_RESUME(stub_errno) do { \ - qb_local_t *__local = stub->frame->local; \ - xlator_t *__this = stub->frame->this; \ - \ - stub->frame->local = NULL; \ - call_resume (stub); \ - if (__local) \ - qb_local_free (__this, __local); \ - } while (0) - -#endif /* !__QEMU_BLOCK_H */ diff --git a/xlators/features/quiesce/src/Makefile.am b/xlators/features/quiesce/src/Makefile.am index 15e46629e78..74ea999c045 100644 --- a/xlators/features/quiesce/src/Makefile.am +++ b/xlators/features/quiesce/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = quiesce.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -quiesce_la_LDFLAGS = -module -avoid-version +quiesce_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) quiesce_la_SOURCES = quiesce.c quiesce_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = quiesce.h quiesce-mem-types.h +noinst_HEADERS = quiesce.h quiesce-mem-types.h quiesce-messages.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/quiesce/src/quiesce-mem-types.h b/xlators/features/quiesce/src/quiesce-mem-types.h index 6e582f424ea..416456b13af 100644 --- a/xlators/features/quiesce/src/quiesce-mem-types.h +++ b/xlators/features/quiesce/src/quiesce-mem-types.h @@ -11,10 +11,11 @@ #ifndef 
__QUIESCE_MEM_TYPES_H__ #define __QUIESCE_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_quiesce_mem_types_ { - gf_quiesce_mt_priv_t = gf_common_mt_end + 1, - gf_quiesce_mt_end + gf_quiesce_mt_priv_t = gf_common_mt_end + 1, + gf_quiesce_mt_failover_hosts, + gf_quiesce_mt_end }; #endif diff --git a/xlators/features/quiesce/src/quiesce-messages.h b/xlators/features/quiesce/src/quiesce-messages.h new file mode 100644 index 00000000000..32ffd409807 --- /dev/null +++ b/xlators/features/quiesce/src/quiesce-messages.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __QUIESCE_MESSAGES_H__ +#define __QUIESCE_MESSAGES_H__ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(QUIESCE, QUIESCE_MSG_INVAL_HOST, QUIESCE_MSG_FAILOVER_FAILED); + +#endif /* __NL_CACHE_MESSAGES_H__ */ diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c index 24c7dc6ed31..0e5eb60a16f 100644 --- a/xlators/features/quiesce/src/quiesce.c +++ b/xlators/features/quiesce/src/quiesce.c @@ -7,669 +7,815 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "quiesce.h" -#include "defaults.h" -#include "call-stub.h" +#include <glusterfs/defaults.h> +#include <glusterfs/call-stub.h> /* TODO: */ /* Think about 'writev/_*_lk/setattr/xattrop/' fops to do re-transmittion */ +void +gf_quiesce_timeout(void *data); /* Quiesce Specific Functions */ void -gf_quiesce_local_wipe (xlator_t *this, quiesce_local_t *local) +gf_quiesce_local_wipe(xlator_t *this, quiesce_local_t *local) { - if (!local || !this || !this->private) - return; + if (!local || !this || !this->private) + return; - if (local->loc.inode) - loc_wipe (&local->loc); - if (local->fd) - fd_unref (local->fd); - GF_FREE (local->name); - GF_FREE (local->volname); - if (local->dict) - dict_unref (local->dict); - if (local->iobref) - iobref_unref (local->iobref); - GF_FREE (local->vector); + if (local->loc.inode) + loc_wipe(&local->loc); + if (local->fd) + fd_unref(local->fd); + GF_FREE(local->name); + GF_FREE(local->volname); + if (local->dict) + dict_unref(local->dict); + if (local->iobref) + iobref_unref(local->iobref); + GF_FREE(local->vector); - mem_put (local); + mem_put(local); } -call_stub_t * -gf_quiesce_dequeue (xlator_t *this) +void +__gf_quiesce_start_timer(xlator_t *this, quiesce_priv_t *priv) { - call_stub_t *stub = NULL; - quiesce_priv_t *priv = NULL; + struct timespec timeout = { + 0, + }; - priv = this->private; + if (!priv->timer) { + timeout.tv_sec = priv->timeout; + timeout.tv_nsec = 0; - if (!priv || list_empty (&priv->req)) - return NULL; - - LOCK (&priv->lock); - { - stub = list_entry (priv->req.next, call_stub_t, list); - list_del_init (&stub->list); - priv->queue_size--; + priv->timer = gf_timer_call_after(this->ctx, timeout, + gf_quiesce_timeout, (void *)this); + if (priv->timer == NULL) { + gf_log(this->name, GF_LOG_ERROR, "Cannot create timer"); } - UNLOCK (&priv->lock); + } +} + +static void +__gf_quiesce_cleanup_failover_hosts(xlator_t *this, quiesce_priv_t 
*priv) +{ + quiesce_failover_hosts_t *tmp = NULL; + quiesce_failover_hosts_t *failover_host = NULL; - return stub; + list_for_each_entry_safe(failover_host, tmp, &priv->failover_list, list) + { + GF_FREE(failover_host->addr); + list_del(&failover_host->list); + GF_FREE(failover_host); + } + return; } -void * -gf_quiesce_dequeue_start (void *data) +void +gf_quiesce_populate_failover_hosts(xlator_t *this, quiesce_priv_t *priv, + const char *value) +{ + char *dup_val = NULL; + char *addr_tok = NULL; + char *save_ptr = NULL; + quiesce_failover_hosts_t *failover_host = NULL; + + if (!value) + goto out; + + dup_val = gf_strdup(value); + if (!dup_val) + goto out; + + addr_tok = strtok_r(dup_val, ",", &save_ptr); + LOCK(&priv->lock); + { + if (!list_empty(&priv->failover_list)) + __gf_quiesce_cleanup_failover_hosts(this, priv); + + while (addr_tok) { + if (!valid_internet_address(addr_tok, _gf_true, _gf_false)) { + gf_msg(this->name, GF_LOG_INFO, 0, QUIESCE_MSG_INVAL_HOST, + "Specified " + "invalid internet address:%s", + addr_tok); + continue; + } + failover_host = GF_CALLOC(1, sizeof(*failover_host), + gf_quiesce_mt_failover_hosts); + failover_host->addr = gf_strdup(addr_tok); + INIT_LIST_HEAD(&failover_host->list); + list_add(&failover_host->list, &priv->failover_list); + addr_tok = strtok_r(NULL, ",", &save_ptr); + } + } + UNLOCK(&priv->lock); + GF_FREE(dup_val); +out: + return; +} + +int32_t +gf_quiesce_failover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - xlator_t *this = NULL; - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; - this = data; - priv = this->private; - THIS = this; + if (op_ret < 0) { + /* Failure here doesn't mean the failover to another host didn't + * succeed, we will know if failover succeeds or not by the + * CHILD_UP/CHILD_DOWN event. 
A failure here indicates something + * went wrong with the submission of failover command, hence + * just abort the failover attempts without retrying with other + * hosts. + */ + gf_msg(this->name, GF_LOG_INFO, op_errno, QUIESCE_MSG_FAILOVER_FAILED, + "Initiating failover to host:%s failed:", (char *)cookie); + } - while (!list_empty (&priv->req)) { - stub = gf_quiesce_dequeue (this); - if (stub) { - call_resume (stub); - } - } + GF_FREE(cookie); + STACK_DESTROY(frame->root); - return 0; + priv = this->private; + __gf_quiesce_start_timer(this, priv); + + return 0; } +int +__gf_quiesce_perform_failover(xlator_t *this) +{ + int ret = 0; + call_frame_t *frame = NULL; + dict_t *dict = NULL; + quiesce_priv_t *priv = NULL; + quiesce_failover_hosts_t *failover_host = NULL; + quiesce_failover_hosts_t *host = NULL; + + priv = this->private; + + if (priv->pass_through) { + gf_msg_trace(this->name, 0, + "child is up, hence not " + "performing any failover"); + goto out; + } + + list_for_each_entry(failover_host, &priv->failover_list, list) + { + if (failover_host->tried == 0) { + host = failover_host; + failover_host->tried = 1; + break; + } + } + if (!host) { + /*TODO: Keep trying until any of the gfproxy comes back up. 
+ Currently it tries failing over once for each host, + if it doesn't succeed then returns error to mount point + list_for_each_entry (failover_host, + &priv->failover_list, list) { + failover_host->tried = 0; + }*/ + gf_msg_debug(this->name, 0, + "all the failover hosts have " + "been tried and looks like didn't succeed"); + ret = -1; + goto out; + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) { + gf_msg_debug(this->name, 0, "failed to create the frame"); + ret = -1; + goto out; + } + + dict = dict_new(); + + ret = dict_set_dynstr(dict, CLIENT_CMD_CONNECT, gf_strdup(host->addr)); + + gf_msg_trace(this->name, 0, "Initiating failover to:%s", host->addr); + + STACK_WIND_COOKIE(frame, gf_quiesce_failover_cbk, NULL, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, NULL, dict, 0, NULL); +out: -void -gf_quiesce_timeout (void *data) + if (dict) + dict_unref(dict); + + return ret; +} + +call_stub_t * +gf_quiesce_dequeue(xlator_t *this) { - xlator_t *this = NULL; - quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; - this = data; - priv = this->private; - THIS = this; + priv = this->private; - LOCK (&priv->lock); - { - priv->pass_through = _gf_true; - } - UNLOCK (&priv->lock); + if (!priv || list_empty(&priv->req)) + return NULL; - gf_quiesce_dequeue_start (this); + LOCK(&priv->lock); + { + stub = list_entry(priv->req.next, call_stub_t, list); + list_del_init(&stub->list); + priv->queue_size--; + } + UNLOCK(&priv->lock); - return; + return stub; } -void -gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub) +void * +gf_quiesce_dequeue_start(void *data) { - quiesce_priv_t *priv = NULL; - struct timespec timeout = {0,}; + xlator_t *this = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; - if (!priv) { - gf_log_callingfn (this->name, GF_LOG_ERROR, - "this->private == NULL"); - return; - } + this = data; + priv = this->private; + THIS = this; - LOCK (&priv->lock); - { - 
list_add_tail (&stub->list, &priv->req); - priv->queue_size++; + while (!list_empty(&priv->req)) { + stub = gf_quiesce_dequeue(this); + if (stub) { + call_resume(stub); } - UNLOCK (&priv->lock); + } + + return 0; +} + +void +gf_quiesce_timeout(void *data) +{ + xlator_t *this = NULL; + quiesce_priv_t *priv = NULL; + int ret = -1; - if (!priv->timer) { - timeout.tv_sec = 20; - timeout.tv_nsec = 0; + this = data; + priv = this->private; + THIS = this; - priv->timer = gf_timer_call_after (this->ctx, - timeout, - gf_quiesce_timeout, - (void *) this); + LOCK(&priv->lock); + { + priv->timer = NULL; + if (priv->pass_through) { + UNLOCK(&priv->lock); + goto out; } + ret = __gf_quiesce_perform_failover(THIS); + } + UNLOCK(&priv->lock); - return; + if (ret < 0) { + priv->pass_through = _gf_true; + gf_quiesce_dequeue_start(this); + } + +out: + return; } +void +gf_quiesce_enqueue(xlator_t *this, call_stub_t *stub) +{ + quiesce_priv_t *priv = NULL; + priv = this->private; + if (!priv) { + gf_log_callingfn(this->name, GF_LOG_ERROR, "this->private == NULL"); + return; + } + + LOCK(&priv->lock); + { + list_add_tail(&stub->list, &priv->req); + priv->queue_size++; + __gf_quiesce_start_timer(this, priv); + } + UNLOCK(&priv->lock); + + return; +} /* _CBK function section */ int32_t -quiesce_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) +quiesce_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *dict, struct iatt *postparent) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_lookup_stub (frame, default_lookup_resume, - &local->loc, local->dict); 
- if (!stub) { - STACK_UNWIND_STRICT (lookup, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_lookup_stub(frame, default_lookup_resume, &local->loc, + local->dict); + if (!stub) { + STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto out; } - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, - dict, postparent); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, dict, + postparent); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +quiesce_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_stat_stub (frame, default_stat_resume, - &local->loc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (stat, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_stat_stub(frame, default_stat_resume, &local->loc, xdata); + if (!stub) { + STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); + gf_quiesce_enqueue(this, 
stub); + goto out; + } + + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +quiesce_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_access_stub (frame, default_access_resume, - &local->loc, local->flag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (access, frame, -1, ENOMEM, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_access_stub(frame, default_access_resume, &local->loc, + local->flag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(access, frame, -1, ENOMEM, NULL); + goto out; } - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *buf, dict_t *xdata) +quiesce_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; 
- if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_readlink_stub (frame, default_readlink_resume, - &local->loc, local->size, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readlink, frame, -1, ENOMEM, - NULL, NULL, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_readlink_stub(frame, default_readlink_resume, &local->loc, + local->size, xdata); + if (!stub) { + STACK_UNWIND_STRICT(readlink, frame, -1, ENOMEM, NULL, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, buf, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, path, buf, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +quiesce_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_open_stub (frame, default_open_resume, - &local->loc, local->flag, local->fd, - xdata); - if (!stub) { - STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = 
fop_open_stub(frame, default_open_resume, &local->loc, + local->flag, local->fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +quiesce_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_readv_stub (frame, default_readv_resume, - local->fd, local->size, local->offset, - local->io_flag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, - NULL, 0, NULL, NULL, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_readv_stub(frame, default_readv_resume, local->fd, + local->size, local->offset, local->io_flag, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, + NULL); + goto out; } - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(readv, frame, op_ret, 
op_errno, vector, count, stbuf, + iobref, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +quiesce_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_flush_stub (frame, default_flush_resume, - local->fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_flush_stub(frame, default_flush_resume, local->fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL); + goto out; } - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - - int32_t -quiesce_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +quiesce_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if 
((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_fsync_stub (frame, default_fsync_resume, - local->fd, local->flag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, - NULL, NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_fsync_stub(frame, default_fsync_resume, local->fd, + local->flag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, NULL, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) +quiesce_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_fstat_stub (frame, default_fstat_resume, - local->fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_fstat_stub(frame, default_fstat_resume, local->fd, xdata); + if (!stub) { + 
STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +quiesce_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_opendir_stub (frame, default_opendir_resume, - &local->loc, local->fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (opendir, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_opendir_stub(frame, default_opendir_resume, &local->loc, + local->fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(opendir, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +quiesce_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, 
int32_t op_errno, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_fsyncdir_stub (frame, default_fsyncdir_resume, - local->fd, local->flag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOMEM, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_fsyncdir_stub(frame, default_fsyncdir_resume, local->fd, + local->flag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsyncdir, frame, -1, ENOMEM, NULL); + goto out; } - STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(fsyncdir, frame, op_ret, op_errno, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata) +quiesce_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_statfs_stub (frame, default_statfs_resume, - &local->loc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (statfs, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == 
-1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_statfs_stub(frame, default_statfs_resume, &local->loc, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(statfs, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } int32_t -quiesce_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +quiesce_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, - local->fd, local->name, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, local->fd, + local->name, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - int32_t 
-quiesce_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +quiesce_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_getxattr_stub (frame, default_getxattr_resume, - &local->loc, local->name, xdata); - if (!stub) { - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_getxattr_stub(frame, default_getxattr_resume, &local->loc, + local->name, xdata); + if (!stub) { + STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - int32_t -quiesce_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, - uint8_t *strong_checksum, dict_t *xdata) +quiesce_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, + uint8_t *strong_checksum, dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == 
ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_rchecksum_stub (frame, default_rchecksum_resume, - local->fd, local->offset, local->flag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOMEM, - 0, NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_rchecksum_stub(frame, default_rchecksum_resume, local->fd, + local->offset, local->flag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(rchecksum, frame, -1, ENOMEM, 0, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, - strong_checksum, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(rchecksum, frame, op_ret, op_errno, weak_checksum, + strong_checksum, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - int32_t -quiesce_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) +quiesce_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_readdir_stub (frame, default_readdir_resume, - local->fd, local->size, local->offset, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + 
stub = fop_readdir_stub(frame, default_readdir_resume, local->fd, + local->size, local->offset, xdata); + if (!stub) { + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, entries, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - int32_t -quiesce_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) +quiesce_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - local = frame->local; - frame->local = NULL; - if ((op_ret == -1) && (op_errno == ENOTCONN)) { - /* Re-transmit (by putting in the queue) */ - stub = fop_readdirp_stub (frame, default_readdirp_resume, - local->fd, local->size, local->offset, - local->dict); - if (!stub) { - STACK_UNWIND_STRICT (readdirp, frame, -1, ENOMEM, - NULL, NULL); - goto out; - } - - gf_quiesce_enqueue (this, stub); - goto out; + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_readdirp_stub(frame, default_readdirp_resume, local->fd, + local->size, local->offset, local->dict); + if (!stub) { + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); + goto out; } - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + gf_quiesce_enqueue(this, stub); + goto out; + } + + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); out: - gf_quiesce_local_wipe (this, local); + gf_quiesce_local_wipe(this, local); - return 0; + return 0; } - 
#if 0 int32_t @@ -1015,1596 +1161,1544 @@ out: #endif /* if 0 */ - /* FOP */ /* No retransmittion */ int32_t -quiesce_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name, dict_t *xdata) +quiesce_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_removexattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - loc, - name, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; + } - stub = fop_removexattr_stub (frame, default_removexattr_resume, - loc, name, xdata); - if (!stub) { - STACK_UNWIND_STRICT (removexattr, frame, -1, ENOMEM, NULL); - return 0; - } + stub = fop_removexattr_stub(frame, default_removexattr_resume, loc, name, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(removexattr, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset, dict_t *xdata) +quiesce_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + 
return 0; + } - stub = fop_truncate_stub (frame, default_truncate_resume, loc, offset, xdata); - if (!stub) { - STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + stub = fop_fremovexattr_stub(frame, default_fremovexattr_resume, fd, name, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(fremovexattr, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_fsetxattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - dict_t *dict, - int32_t flags, dict_t *xdata) +quiesce_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_fsetxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetxattr, - fd, - dict, - flags, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + } - stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, - fd, dict, flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, NULL); - return 0; - } + stub = fop_truncate_stub(frame, default_truncate_resume, loc, offset, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags, dict_t *xdata) +quiesce_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + 
call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_setxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, - loc, - dict, - flags, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; + } - stub = fop_setxattr_stub (frame, default_setxattr_resume, - loc, dict, flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, NULL); - return 0; - } + stub = fop_fsetxattr_stub(frame, default_fsetxattr_resume, fd, dict, flags, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsetxattr, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *xdata) +quiesce_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - /* Don't send O_APPEND below, as write() re-transmittions can - fail with O_APPEND */ - STACK_WIND (frame, default_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, (flags & ~O_APPEND), mode, umask, fd, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; + } - stub = fop_create_stub (frame, default_create_resume, - loc, (flags & ~O_APPEND), mode, umask, fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = 
fop_setxattr_stub(frame, default_setxattr_resume, loc, dict, flags, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(setxattr, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +quiesce_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_link_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, - oldloc, newloc, xdata); - return 0; - } + if (priv->pass_through) { + /* Don't send O_APPEND below, as write() re-transmittions can + fail with O_APPEND */ + STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, (flags & ~O_APPEND), + mode, umask, fd, xdata); + return 0; + } - stub = fop_link_stub (frame, default_link_resume, oldloc, newloc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (link, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = fop_create_stub(frame, default_create_resume, loc, + (flags & ~O_APPEND), mode, umask, fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +quiesce_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = 
this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldloc, newloc, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; + } - stub = fop_rename_stub (frame, default_rename_resume, oldloc, newloc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = fop_link_stub(frame, default_link_resume, oldloc, newloc, xdata); + if (!stub) { + STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } +int32_t +quiesce_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + + if (priv->pass_through) { + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + + stub = fop_rename_stub(frame, default_rename_resume, oldloc, newloc, xdata); + if (!stub) { + STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; +} int -quiesce_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) +quiesce_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, default_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - linkpath, loc, umask, 
xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, + xdata); + return 0; + } - stub = fop_symlink_stub (frame, default_symlink_resume, - linkpath, loc, umask, xdata); - if (!stub) { - STACK_UNWIND_STRICT (symlink, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = fop_symlink_stub(frame, default_symlink_resume, linkpath, loc, umask, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(symlink, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } - int -quiesce_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) +quiesce_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, default_rmdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, - loc, flags, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + return 0; + } - stub = fop_rmdir_stub (frame, default_rmdir_resume, loc, flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (rmdir, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + stub = fop_rmdir_stub(frame, default_rmdir_resume, loc, flags, xdata); + if (!stub) { + STACK_UNWIND_STRICT(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) +quiesce_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + 
dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc, xflag, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; + } - stub = fop_unlink_stub (frame, default_unlink_resume, loc, xflag, xdata); - if (!stub) { - STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + stub = fop_unlink_stub(frame, default_unlink_resume, loc, xflag, xdata); + if (!stub) { + STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int -quiesce_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +quiesce_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, default_mkdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - loc, mode, umask, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; + } - stub = fop_mkdir_stub (frame, default_mkdir_resume, - loc, mode, umask, xdata); - if (!stub) { - STACK_UNWIND_STRICT (mkdir, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = fop_mkdir_stub(frame, default_mkdir_resume, loc, mode, umask, xdata); + if (!stub) { + STACK_UNWIND_STRICT(mkdir, frame, -1, 
ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } - int -quiesce_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +quiesce_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, default_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, umask, xdata); - return 0; - } + if (priv->pass_through) { + STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, + xdata); + return 0; + } - stub = fop_mknod_stub (frame, default_mknod_resume, - loc, mode, rdev, umask, xdata); - if (!stub) { - STACK_UNWIND_STRICT (mknod, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL); - return 0; - } + stub = fop_mknod_stub(frame, default_mknod_resume, loc, mode, rdev, umask, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset, dict_t *xdata) +quiesce_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv->pass_through) { - STACK_WIND (frame, - default_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset, xdata); - return 0; - } + if (priv->pass_through) { + 
STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + } - stub = fop_ftruncate_stub (frame, default_ftruncate_resume, fd, offset, xdata); - if (!stub) { - STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + stub = fop_ftruncate_stub(frame, default_ftruncate_resume, fd, offset, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } /* Re-transmittion */ int32_t -quiesce_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size, dict_t *xdata) +quiesce_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - local->size = size; - frame->local = local; - - STACK_WIND (frame, - quiesce_readlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, - loc, - size, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + local->size = size; + frame->local = local; - stub = fop_readlink_stub (frame, default_readlink_resume, loc, size, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readlink, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } - - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; + } + stub = fop_readlink_stub(frame, default_readlink_resume, loc, size, xdata); + if (!stub) { + STACK_UNWIND_STRICT(readlink, frame, -1, 
ENOMEM, NULL, NULL, NULL); return 0; -} + } + gf_quiesce_enqueue(this, stub); + + return 0; +} int32_t -quiesce_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask, dict_t *xdata) +quiesce_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - local->flag = mask; - frame->local = local; - - STACK_WIND (frame, - quiesce_access_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->access, - loc, - mask, xdata); - return 0; - } + priv = this->private; - stub = fop_access_stub (frame, default_access_resume, loc, mask, xdata); - if (!stub) { - STACK_UNWIND_STRICT (access, frame, -1, ENOMEM, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + local->flag = mask; + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; + } + stub = fop_access_stub(frame, default_access_resume, loc, mask, xdata); + if (!stub) { + STACK_UNWIND_STRICT(access, frame, -1, ENOMEM, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_fgetxattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - const char *name, dict_t *xdata) +quiesce_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { 
- local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - if (name) - local->name = gf_strdup (name); - - frame->local = local; - - STACK_WIND (frame, - quiesce_fgetxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, - fd, - name, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + if (name) + local->name = gf_strdup(name); - stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, fd, name, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; + } + stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, fd, name, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc, dict_t *xdata) +quiesce_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - frame->local = local; - - STACK_WIND (frame, - quiesce_statfs_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->statfs, - loc, xdata); - return 0; - } - - stub = fop_statfs_stub (frame, default_statfs_resume, loc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (statfs, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + frame->local = local; - 
gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; + } + stub = fop_statfs_stub(frame, default_statfs_resume, loc, xdata); + if (!stub) { + STACK_UNWIND_STRICT(statfs, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, dict_t *xdata) +quiesce_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->flag = flags; - frame->local = local; - - STACK_WIND (frame, - quiesce_fsyncdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsyncdir, - fd, - flags, xdata); - return 0; - } - - stub = fop_fsyncdir_stub (frame, default_fsyncdir_resume, fd, flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOMEM, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->flag = flags; + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata); + return 0; + } + stub = fop_fsyncdir_stub(frame, default_fsyncdir_resume, fd, flags, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsyncdir, frame, -1, ENOMEM, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) +quiesce_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t 
*xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - local->fd = fd_ref (fd); - frame->local = local; - - STACK_WIND (frame, - quiesce_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + local->fd = fd_ref(fd); + frame->local = local; - stub = fop_opendir_stub (frame, default_opendir_resume, loc, fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (opendir, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; + } + stub = fop_opendir_stub(frame, default_opendir_resume, loc, fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(opendir, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd, dict_t *xdata) +quiesce_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; - - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - frame->local = local; - - STACK_WIND (frame, - quiesce_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd, xdata); - return 0; - } + priv = this->private; - stub = fop_fstat_stub (frame, default_fstat_resume, fd, xdata); - if (!stub) { - 
STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; + } + stub = fop_fstat_stub(frame, default_fstat_resume, fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, dict_t *xdata) +quiesce_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->flag = flags; - frame->local = local; - - STACK_WIND (frame, - quiesce_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, - flags, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->flag = flags; + frame->local = local; - stub = fop_fsync_stub (frame, default_fsync_resume, fd, flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } - - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; + } + stub = fop_fsync_stub(frame, default_fsync_resume, fd, flags, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, NULL, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t 
-quiesce_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd, dict_t *xdata) +quiesce_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - frame->local = local; - - STACK_WIND (frame, - quiesce_flush_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd, xdata); - return 0; - } + priv = this->private; - stub = fop_flush_stub (frame, default_flush_resume, fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + } + stub = fop_flush_stub(frame, default_flush_resume, fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, uint32_t flags, - struct iobref *iobref, dict_t *xdata) +quiesce_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, - vector, - count, - off, flags, - iobref, xdata); - 
return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, + flags, iobref, xdata); + return 0; + } - stub = fop_writev_stub (frame, default_writev_resume, - fd, vector, count, off, flags, iobref, xdata); - if (!stub) { - STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + stub = fop_writev_stub(frame, default_writev_resume, fd, vector, count, off, + flags, iobref, xdata); + if (!stub) { + STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +quiesce_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->size = size; - local->offset = offset; - local->io_flag = flags; - frame->local = local; - - STACK_WIND (frame, - quiesce_readv_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, - size, - offset, flags, xdata); - return 0; - } + priv = this->private; - stub = fop_readv_stub (frame, default_readv_resume, fd, size, offset, - flags, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, - NULL, 0, NULL, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->size = size; + local->offset = offset; + local->io_flag = flags; + frame->local = local; - gf_quiesce_enqueue (this, stub); + 
STACK_WIND(frame, quiesce_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } + stub = fop_readv_stub(frame, default_readv_resume, fd, size, offset, flags, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, + NULL); return 0; -} + } + + gf_quiesce_enqueue(this, stub); + return 0; +} int32_t -quiesce_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, - dict_t *xdata) +quiesce_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - local->fd = fd_ref (fd); - - /* Don't send O_APPEND below, as write() re-transmittions can - fail with O_APPEND */ - local->flag = (flags & ~O_APPEND); - frame->local = local; - - STACK_WIND (frame, - quiesce_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, (flags & ~O_APPEND), fd, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + local->fd = fd_ref(fd); - stub = fop_open_stub (frame, default_open_resume, loc, - (flags & ~O_APPEND), fd, xdata); - if (!stub) { - STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + /* Don't send O_APPEND below, as write() re-transmittions can + fail with O_APPEND */ + local->flag = (flags & ~O_APPEND); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, (flags & ~O_APPEND), fd, + xdata); + return 0; + } + stub = fop_open_stub(frame, default_open_resume, loc, (flags & 
~O_APPEND), + fd, xdata); + if (!stub) { + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name, dict_t *xdata) +quiesce_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - if (name) - local->name = gf_strdup (name); - - frame->local = local; - - STACK_WIND (frame, - quiesce_getxattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - loc, - name, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + if (name) + local->name = gf_strdup(name); - stub = fop_getxattr_stub (frame, default_getxattr_resume, loc, name, xdata); - if (!stub) { - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; + } + stub = fop_getxattr_stub(frame, default_getxattr_resume, loc, name, xdata); + if (!stub) { + STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL); return 0; -} + } + gf_quiesce_enqueue(this, stub); + + return 0; +} int32_t -quiesce_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata) +quiesce_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = 
NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_xattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - loc, - flags, - dict, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata); + return 0; + } - stub = fop_xattrop_stub (frame, default_xattrop_resume, - loc, flags, dict, xdata); - if (!stub) { - STACK_UNWIND_STRICT (xattrop, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + stub = fop_xattrop_stub(frame, default_xattrop_resume, loc, flags, dict, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(xattrop, frame, -1, ENOMEM, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata) +quiesce_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_fxattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fxattrop, - fd, - flags, - dict, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata); + return 0; + } - stub = fop_fxattrop_stub (frame, default_fxattrop_resume, - fd, flags, dict, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, flags, dict, + xdata); + if (!stub) { 
+ STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, NULL, NULL); + return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; +} + +int32_t +quiesce_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - gf_quiesce_enqueue (this, stub); + priv = this->private; + if (priv && priv->pass_through) { + STACK_WIND(frame, default_lk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata); return 0; + } + + stub = fop_lk_stub(frame, default_lk_resume, fd, cmd, lock, xdata); + if (!stub) { + STACK_UNWIND_STRICT(lk, frame, -1, ENOMEM, NULL, NULL); + return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock, dict_t *xdata) +quiesce_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_lk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, - fd, - cmd, + if (priv && priv->pass_through) { + STACK_WIND(frame, default_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, lock, + xdata); + return 0; + } + + stub = fop_inodelk_stub(frame, default_inodelk_resume, volume, loc, cmd, lock, xdata); - return 0; - } + if (!stub) { + STACK_UNWIND_STRICT(inodelk, frame, -1, ENOMEM, NULL); + return 0; + } - stub = fop_lk_stub (frame, default_lk_resume, fd, cmd, lock, xdata); - if (!stub) { - STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + gf_quiesce_enqueue(this, stub); - gf_quiesce_enqueue (this, stub); + return 0; +} + +int32_t +quiesce_finodelk(call_frame_t *frame, 
xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + if (priv && priv->pass_through) { + STACK_WIND(frame, default_finodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, lock, + xdata); return 0; -} + } + stub = fop_finodelk_stub(frame, default_finodelk_resume, volume, fd, cmd, + lock, xdata); + if (!stub) { + STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, NULL); + return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; +} int32_t -quiesce_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) +quiesce_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_inodelk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, - volume, loc, cmd, lock, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd, + type, xdata); + return 0; + } - stub = fop_inodelk_stub (frame, default_inodelk_resume, - volume, loc, cmd, lock, xdata); - if (!stub) { - STACK_UNWIND_STRICT (inodelk, frame, -1, ENOMEM, NULL); - return 0; - } + stub = fop_entrylk_stub(frame, default_entrylk_resume, volume, loc, + basename, cmd, type, xdata); + if (!stub) { + STACK_UNWIND_STRICT(entrylk, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_finodelk (call_frame_t *frame, xlator_t *this, - 
const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +quiesce_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_finodelk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->finodelk, - volume, fd, cmd, lock, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_fentrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd, + type, xdata); + return 0; + } - stub = fop_finodelk_stub (frame, default_finodelk_resume, - volume, fd, cmd, lock, xdata); - if (!stub) { - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, NULL); - return 0; - } + stub = fop_fentrylk_stub(frame, default_fentrylk_resume, volume, fd, + basename, cmd, type, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +quiesce_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - STACK_WIND (frame, default_entrylk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->entrylk, - volume, loc, basename, cmd, type, xdata); - return 0; - } + priv = this->private; - stub = fop_entrylk_stub (frame, default_entrylk_resume, 
- volume, loc, basename, cmd, type, xdata); - if (!stub) { - STACK_UNWIND_STRICT (entrylk, frame, -1, ENOMEM, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->offset = offset; + local->flag = len; + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_rchecksum_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); + return 0; + } + stub = fop_rchecksum_stub(frame, default_rchecksum_resume, fd, offset, len, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(rchecksum, frame, -1, ENOMEM, 0, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +quiesce_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, default_fentrylk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fentrylk, - volume, fd, basename, cmd, type, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->size = size; + local->offset = off; + frame->local = local; - stub = fop_fentrylk_stub (frame, default_fentrylk_resume, - volume, fd, basename, cmd, type, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, NULL); - return 0; - } - - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata); + return 0; + } + stub = fop_readdir_stub(frame, default_readdir_resume, fd, size, off, + 
xdata); + if (!stub) { + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_rchecksum (call_frame_t *frame, - xlator_t *this, - fd_t *fd, off_t offset, - int32_t len, dict_t *xdata) +quiesce_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->offset = offset; - local->flag = len; - frame->local = local; - - STACK_WIND (frame, - quiesce_rchecksum_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rchecksum, - fd, offset, len, xdata); - return 0; - } + priv = this->private; - stub = fop_rchecksum_stub (frame, default_rchecksum_resume, - fd, offset, len, xdata); - if (!stub) { - STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOMEM, 0, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->size = size; + local->offset = off; + local->dict = dict_ref(dict); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + return 0; + } + stub = fop_readdirp_stub(frame, default_readdirp_resume, fd, size, off, + dict); + if (!stub) { + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); return 0; -} + } + gf_quiesce_enqueue(this, stub); + + return 0; +} int32_t -quiesce_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off, dict_t *xdata) +quiesce_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - quiesce_priv_t *priv = 
NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->size = size; - local->offset = off; - frame->local = local; - - STACK_WIND (frame, - quiesce_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - fd, size, off, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + } - stub = fop_readdir_stub (frame, default_readdir_resume, fd, size, off, xdata); - if (!stub) { - STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + stub = fop_setattr_stub(frame, default_setattr_resume, loc, stbuf, valid, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } - int32_t -quiesce_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off, dict_t *dict) +quiesce_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; - - priv = this->private; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - local->fd = fd_ref (fd); - local->size = size; - local->offset = off; - local->dict = dict_ref (dict); - frame->local = local; - - STACK_WIND (frame, - quiesce_readdirp_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, - fd, size, off, dict); - return 0; - } + priv = this->private; - stub = fop_readdirp_stub (frame, default_readdirp_resume, fd, size, - off, dict); - if 
(!stub) { - STACK_UNWIND_STRICT (readdirp, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; + } + stub = fop_stat_stub(frame, default_stat_resume, loc, xdata); + if (!stub) { + STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -quiesce_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid, dict_t *xdata) +quiesce_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; - - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_setattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid, xdata); - return 0; - } + priv = this->private; - stub = fop_setattr_stub (frame, default_setattr_resume, - loc, stbuf, valid, xdata); - if (!stub) { - STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + if (priv && priv->pass_through) { + local = mem_get0(priv->local_pool); + loc_dup(loc, &local->loc); + local->dict = dict_ref(xattr_req); + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + return 0; + } + stub = fop_lookup_stub(frame, default_lookup_resume, loc, xattr_req); + if (!stub) { + STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); return 0; -} + } + + gf_quiesce_enqueue(this, stub); + return 0; +} int32_t -quiesce_stat (call_frame_t *frame, - xlator_t *this, - loc_t 
*loc, dict_t *xdata) +quiesce_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - frame->local = local; - - STACK_WIND (frame, - quiesce_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc, xdata); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + } - stub = fop_stat_stub (frame, default_stat_resume, loc, xdata); - if (!stub) { - STACK_UNWIND_STRICT (stat, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + stub = fop_fsetattr_stub(frame, default_fsetattr_resume, fd, stbuf, valid, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); - return 0; + return 0; } int32_t -quiesce_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) +quiesce_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; - quiesce_local_t *local = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - local = mem_get0 (priv->local_pool); - loc_dup (loc, &local->loc); - local->dict = dict_ref (xattr_req); - frame->local = local; - - STACK_WIND (frame, - quiesce_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xattr_req); - return 0; - } + if (priv && priv->pass_through) { + STACK_WIND(frame, 
default_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; + } + + stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, offset, + len, xdata); + if (!stub) { + STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - stub = fop_lookup_stub (frame, default_lookup_resume, loc, xattr_req); + gf_quiesce_enqueue(this, stub); + + return 0; +} + +int +quiesce_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata) +{ + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + if ((op_ret == -1) && (op_errno == ENOTCONN)) { + /* Re-transmit (by putting in the queue) */ + stub = fop_seek_stub(frame, default_seek_resume, local->fd, + local->offset, local->what, xdata); if (!stub) { - STACK_UNWIND_STRICT (lookup, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(seek, frame, -1, ENOMEM, 0, NULL); + goto out; } - gf_quiesce_enqueue (this, stub); + gf_quiesce_enqueue(this, stub); + goto out; + } - return 0; + STACK_UNWIND_STRICT(seek, frame, op_ret, op_errno, offset, xdata); +out: + gf_quiesce_local_wipe(this, local); + + return 0; } -int32_t -quiesce_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iatt *stbuf, - int32_t valid, dict_t *xdata) +int +quiesce_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) { - quiesce_priv_t *priv = NULL; - call_stub_t *stub = NULL; + quiesce_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quiesce_local_t *local = NULL; - priv = this->private; + priv = this->private; - if (priv && priv->pass_through) { - STACK_WIND (frame, - default_fsetattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid, xdata); - return 0; - } + if (priv && priv->pass_through) { + local = 
mem_get0(priv->local_pool); + local->fd = fd_ref(fd); + local->offset = offset; + local->what = what; - stub = fop_fsetattr_stub (frame, default_fsetattr_resume, - fd, stbuf, valid, xdata); - if (!stub) { - STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; - } + frame->local = local; - gf_quiesce_enqueue (this, stub); + STACK_WIND(frame, quiesce_seek_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata); + return 0; + } + stub = fop_seek_stub(frame, default_seek_resume, fd, offset, what, xdata); + if (!stub) { + STACK_UNWIND_STRICT(seek, frame, -1, ENOMEM, 0, NULL); return 0; + } + + gf_quiesce_enqueue(this, stub); + + return 0; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - ret = xlator_mem_acct_init (this, gf_quiesce_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_quiesce_mt_end + 1); - return ret; + return ret; } int -init (xlator_t *this) +reconfigure(xlator_t *this, dict_t *options) { - int ret = -1; - quiesce_priv_t *priv = NULL; + int32_t ret = -1; + quiesce_priv_t *priv = NULL; - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "'quiesce' not configured with exactly one child"); - goto out; - } + priv = this->private; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile "); - } + GF_OPTION_RECONF("timeout", priv->timeout, options, time, out); + GF_OPTION_RECONF("failover-hosts", priv->failover_hosts, options, str, out); + gf_quiesce_populate_failover_hosts(this, priv, priv->failover_hosts); - priv = GF_CALLOC (1, sizeof (*priv), gf_quiesce_mt_priv_t); - if (!priv) - goto out; + ret = 0; +out: + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + quiesce_priv_t *priv = NULL; - priv->local_pool = mem_pool_new (quiesce_local_t, - GF_FOPS_EXPECTED_IN_PARALLEL); + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "'quiesce' not configured with exactly one child"); + goto out; + } - LOCK_INIT (&priv->lock); - priv->pass_through = _gf_false; + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile "); + } - INIT_LIST_HEAD (&priv->req); + priv = GF_CALLOC(1, sizeof(*priv), gf_quiesce_mt_priv_t); + if (!priv) + goto out; - this->private = priv; - ret = 0; + INIT_LIST_HEAD(&priv->failover_list); + + GF_OPTION_INIT("timeout", priv->timeout, time, out); + GF_OPTION_INIT("failover-hosts", priv->failover_hosts, str, out); + gf_quiesce_populate_failover_hosts(this, priv, priv->failover_hosts); + + priv->local_pool = mem_pool_new(quiesce_local_t, + GF_FOPS_EXPECTED_IN_PARALLEL); + + LOCK_INIT(&priv->lock); + priv->pass_through = _gf_false; + + INIT_LIST_HEAD(&priv->req); + + this->private = priv; + ret = 0; out: - return ret; + return ret; } void -fini (xlator_t *this) +fini(xlator_t *this) { - quiesce_priv_t *priv = NULL; + quiesce_priv_t *priv = NULL; - priv = this->private; - if (!priv) - goto out; - this->private = NULL; + priv = this->private; + if (!priv) + goto out; + this->private = NULL; - mem_pool_destroy (priv->local_pool); - LOCK_DESTROY (&priv->lock); - GF_FREE (priv); + mem_pool_destroy(priv->local_pool); + priv->local_pool = NULL; + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); out: - return; + return; } int -notify (xlator_t *this, int 
event, void *data, ...) -{ - int ret = 0; - quiesce_priv_t *priv = NULL; - struct timespec timeout = {0,}; - - priv = this->private; - if (!priv) - goto out; - - switch (event) { - case GF_EVENT_CHILD_UP: - { - ret = pthread_create (&priv->thr, NULL, gf_quiesce_dequeue_start, - this); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create the quiesce-dequeue thread"); - } - - LOCK (&priv->lock); - { - priv->pass_through = _gf_true; - } - UNLOCK (&priv->lock); - break; +notify(xlator_t *this, int event, void *data, ...) +{ + int ret = 0; + quiesce_priv_t *priv = NULL; + + priv = this->private; + if (!priv) + goto out; + + switch (event) { + case GF_EVENT_CHILD_UP: { + ret = gf_thread_create(&priv->thr, NULL, gf_quiesce_dequeue_start, + this, "quiesce"); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to create the quiesce-dequeue thread"); + } + + LOCK(&priv->lock); + { + priv->pass_through = _gf_true; + } + UNLOCK(&priv->lock); + break; } case GF_EVENT_CHILD_DOWN: - LOCK (&priv->lock); - { - priv->pass_through = _gf_false; - } - UNLOCK (&priv->lock); - - if (priv->timer) - break; - timeout.tv_sec = 20; - timeout.tv_nsec = 0; - - priv->timer = gf_timer_call_after (this->ctx, - timeout, - gf_quiesce_timeout, - (void *) this); - - if (priv->timer == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot create timer"); - } - - break; + LOCK(&priv->lock); + { + priv->pass_through = _gf_false; + __gf_quiesce_start_timer(this, priv); + } + UNLOCK(&priv->lock); + break; default: - break; - } + break; + } - ret = default_notify (this, event, data); + ret = default_notify(this, event, data); out: - return ret; + return ret; } - struct xlator_fops fops = { - /* write/modifying fops */ - .mknod = quiesce_mknod, - .create = quiesce_create, - .truncate = quiesce_truncate, - .ftruncate = quiesce_ftruncate, - .setxattr = quiesce_setxattr, - .removexattr = quiesce_removexattr, - .symlink = quiesce_symlink, - .unlink = quiesce_unlink, - .link = quiesce_link, 
- .mkdir = quiesce_mkdir, - .rmdir = quiesce_rmdir, - .rename = quiesce_rename, - - /* The below calls are known to change state, hence - re-transmittion is not advised */ - .lk = quiesce_lk, - .inodelk = quiesce_inodelk, - .finodelk = quiesce_finodelk, - .entrylk = quiesce_entrylk, - .fentrylk = quiesce_fentrylk, - .xattrop = quiesce_xattrop, - .fxattrop = quiesce_fxattrop, - .setattr = quiesce_setattr, - .fsetattr = quiesce_fsetattr, - - /* Special case, re-transmittion is not harmful * - * as offset is properly sent from above layers */ - /* TODO: not re-transmitted as of now */ - .writev = quiesce_writev, - - /* re-transmittable fops */ - .lookup = quiesce_lookup, - .stat = quiesce_stat, - .fstat = quiesce_fstat, - .access = quiesce_access, - .readlink = quiesce_readlink, - .getxattr = quiesce_getxattr, - .open = quiesce_open, - .readv = quiesce_readv, - .flush = quiesce_flush, - .fsync = quiesce_fsync, - .statfs = quiesce_statfs, - .opendir = quiesce_opendir, - .readdir = quiesce_readdir, - .readdirp = quiesce_readdirp, - .fsyncdir = quiesce_fsyncdir, - + /* write/modifying fops */ + .mknod = quiesce_mknod, + .create = quiesce_create, + .truncate = quiesce_truncate, + .ftruncate = quiesce_ftruncate, + .setxattr = quiesce_setxattr, + .fsetxattr = quiesce_fsetxattr, + .removexattr = quiesce_removexattr, + .fremovexattr = quiesce_fremovexattr, + .symlink = quiesce_symlink, + .unlink = quiesce_unlink, + .link = quiesce_link, + .mkdir = quiesce_mkdir, + .rmdir = quiesce_rmdir, + .rename = quiesce_rename, + .fallocate = quiesce_fallocate, + + /* The below calls are known to change state, hence + re-transmittion is not advised */ + .lk = quiesce_lk, + .inodelk = quiesce_inodelk, + .finodelk = quiesce_finodelk, + .entrylk = quiesce_entrylk, + .fentrylk = quiesce_fentrylk, + .xattrop = quiesce_xattrop, + .fxattrop = quiesce_fxattrop, + .setattr = quiesce_setattr, + .fsetattr = quiesce_fsetattr, + + /* Special case, re-transmittion is not harmful * + * as offset is 
properly sent from above layers */ + /* TODO: not re-transmitted as of now */ + .writev = quiesce_writev, + + /* re-transmittable fops */ + .lookup = quiesce_lookup, + .stat = quiesce_stat, + .fstat = quiesce_fstat, + .access = quiesce_access, + .readlink = quiesce_readlink, + .getxattr = quiesce_getxattr, + .fgetxattr = quiesce_fgetxattr, + .open = quiesce_open, + .readv = quiesce_readv, + .flush = quiesce_flush, + .fsync = quiesce_fsync, + .statfs = quiesce_statfs, + .opendir = quiesce_opendir, + .readdir = quiesce_readdir, + .readdirp = quiesce_readdirp, + .fsyncdir = quiesce_fsyncdir, + .seek = quiesce_seek, }; struct xlator_dumpops dumpops; - struct xlator_cbks cbks; - struct volume_options options[] = { - { .key = {NULL} }, + { + .key = {"timeout"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "45", + .description = + "After 'timeout' seconds since the time 'quiesce' " + "option was set to \"!pass-through\", acknowledgements to file " + "operations are no longer quiesced and previously " + "quiesced acknowledgements are sent to the application", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + }, + {.key = {"failover-hosts"}, + .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST, + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "It is a comma separated list of hostname/IP " + "addresses. 
It Specifies the list of hosts where " + "the gfproxy daemons are running, to which the " + "the thin clients can failover to."}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {GD_OP_VERSION_3_12_0}, + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "quiesce", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/features/quiesce/src/quiesce.h b/xlators/features/quiesce/src/quiesce.h index 878ed77e928..6ab2af40a56 100644 --- a/xlators/features/quiesce/src/quiesce.h +++ b/xlators/features/quiesce/src/quiesce.h @@ -12,40 +12,54 @@ #define __QUIESCE_H__ #include "quiesce-mem-types.h" -#include "xlator.h" -#include "timer.h" +#include "quiesce-messages.h" +#include <glusterfs/xlator.h> +#include <glusterfs/timer.h> #define GF_FOPS_EXPECTED_IN_PARALLEL 512 typedef struct { - gf_timer_t *timer; - gf_boolean_t pass_through; - gf_lock_t lock; - struct list_head req; - int queue_size; - pthread_t thr; - struct mem_pool *local_pool; + struct list_head list; + char *addr; + gf_boolean_t tried; /* indicates attempted connecting */ +} quiesce_failover_hosts_t; + +typedef struct { + gf_timer_t *timer; + gf_boolean_t pass_through; + gf_lock_t lock; + struct list_head req; + int queue_size; + pthread_t thr; + struct mem_pool *local_pool; + uint32_t timeout; + char *failover_hosts; + struct list_head failover_list; } quiesce_priv_t; typedef struct { - fd_t *fd; - char *name; - char *volname; - loc_t loc; - off_t size; - off_t offset; - mode_t mode; - int32_t flag; - struct iatt stbuf; - struct iovec *vector; - struct iobref *iobref; - dict_t *dict; - struct gf_flock flock; - entrylk_cmd cmd; - entrylk_type type; - gf_xattrop_flags_t xattrop_flags; - int32_t wbflags; - uint32_t io_flag; + fd_t *fd; + char *name; + char *volname; + loc_t loc; + off_t size; + off_t offset; + mode_t mode; + int32_t 
flag; + struct iatt stbuf; + struct iovec *vector; + struct iobref *iobref; + dict_t *dict; + struct gf_flock flock; + entrylk_cmd cmd; + entrylk_type type; + gf_xattrop_flags_t xattrop_flags; + int32_t wbflags; + uint32_t io_flag; + /* for fallocate */ + size_t len; + /* for lseek */ + gf_seek_what_t what; } quiesce_local_t; #endif diff --git a/xlators/features/quota/src/Makefile.am b/xlators/features/quota/src/Makefile.am index 7165adc59ef..1c2dcef0ca3 100644 --- a/xlators/features/quota/src/Makefile.am +++ b/xlators/features/quota/src/Makefile.am @@ -1,22 +1,29 @@ +if WITH_SERVER xlator_LTLIBRARIES = quota.la quotad.la +endif xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -quota_la_LDFLAGS = -module -avoid-version -quotad_la_LDFLAGS = -module -avoid-version +quota_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +quotad_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) quota_la_SOURCES = quota.c quota-enforcer-client.c -quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la quotad_la_SOURCES = quotad.c quotad-helpers.c quotad-aggregator.c -quotad_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +quotad_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la -noinst_HEADERS = quota-mem-types.h quota.h quotad-aggregator.h quotad-helpers.h +noinst_HEADERS = quota-mem-types.h quota.h quotad-aggregator.h \ + quotad-helpers.h quota-messages.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/xlators/cluster/dht/src -I$(top_srcdir)/rpc/xdr/src/ \ - -I$(top_srcdir)/rpc/rpc-lib/src + -I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/cluster/dht/src AM_CFLAGS = -Wall 
$(GF_CFLAGS) CLEANFILES = - diff --git a/xlators/features/quota/src/quota-enforcer-client.c b/xlators/features/quota/src/quota-enforcer-client.c index bfea5e42014..480d64ade27 100644 --- a/xlators/features/quota/src/quota-enforcer-client.c +++ b/xlators/features/quota/src/quota-enforcer-client.c @@ -28,337 +28,476 @@ #include <semaphore.h> #include <errno.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #ifdef HAVE_MALLOC_H #include <malloc.h> #endif -#ifdef HAVE_MALLOC_STATS -#ifdef DEBUG -#include <mcheck.h> -#endif -#endif - #include "quota.h" +#include "quota-messages.h" extern struct rpc_clnt_program quota_enforcer_clnt; int32_t -quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent); +quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent); int -quota_enforcer_submit_request (void *req, call_frame_t *frame, - rpc_clnt_prog_t *prog, - int procnum, struct iobref *iobref, - xlator_t *this, fop_cbk_fn_t cbkfn, - xdrproc_t xdrproc) +quota_enforcer_submit_request(void *req, call_frame_t *frame, + rpc_clnt_prog_t *prog, int procnum, + struct iobref *iobref, xlator_t *this, + fop_cbk_fn_t cbkfn, xdrproc_t xdrproc) { - int ret = -1; - int count = 0; - struct iovec iov = {0, }; - struct iobuf *iobuf = NULL; - char new_iobref = 0; - ssize_t xdr_size = 0; - quota_priv_t *priv = NULL; - - GF_ASSERT (this); - - priv = this->private; - - if (req) { - xdr_size = xdr_sizeof (xdrproc, req); - iobuf = iobuf_get2 (this->ctx->iobuf_pool, xdr_size); - if (!iobuf) { - goto out; - } - - if (!iobref) { - iobref = iobref_new (); - if (!iobref) { - goto out; - } - - new_iobref = 1; - } - - iobref_add (iobref, iobuf); - - iov.iov_base = iobuf->ptr; - iov.iov_len = iobuf_size (iobuf); - - /* Create the xdr payload 
*/ - ret = xdr_serialize_generic (iov, req, xdrproc); - if (ret == -1) { - goto out; - } - iov.iov_len = ret; - count = 1; + int ret = -1; + int count = 0; + struct iovec iov = { + 0, + }; + struct iobuf *iobuf = NULL; + char new_iobref = 0; + ssize_t xdr_size = 0; + quota_priv_t *priv = NULL; + + GF_ASSERT(this); + + priv = this->private; + + if (req) { + xdr_size = xdr_sizeof(xdrproc, req); + iobuf = iobuf_get2(this->ctx->iobuf_pool, xdr_size); + if (!iobuf) { + goto out; + } + + if (!iobref) { + iobref = iobref_new(); + if (!iobref) { + goto out; + } + + new_iobref = 1; } - /* Send the msg */ - ret = rpc_clnt_submit (priv->rpc_clnt, prog, procnum, cbkfn, - &iov, count, - NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL); - ret = 0; + iobref_add(iobref, iobuf); + + iov.iov_base = iobuf->ptr; + iov.iov_len = iobuf_size(iobuf); + + /* Create the xdr payload */ + ret = xdr_serialize_generic(iov, req, xdrproc); + if (ret == -1) { + goto out; + } + iov.iov_len = ret; + count = 1; + } + + /* Send the msg */ + ret = rpc_clnt_submit(priv->rpc_clnt, prog, procnum, cbkfn, &iov, count, + NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL); + ret = 0; out: - if (new_iobref) - iobref_unref (iobref); - if (iobuf) - iobuf_unref (iobuf); + if (new_iobref) + iobref_unref(iobref); + if (iobuf) + iobuf_unref(iobuf); - return ret; + return ret; } int -quota_enforcer_lookup_cbk (struct rpc_req *req, struct iovec *iov, - int count, void *myframe) +quota_enforcer_lookup_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) { - quota_local_t *local = NULL; - call_frame_t *frame = NULL; - int ret = 0; - gfs3_lookup_rsp rsp = {0,}; - struct iatt stbuf = {0,}; - struct iatt postparent = {0,}; - int op_errno = EINVAL; - dict_t *xdata = NULL; - inode_t *inode = NULL; - xlator_t *this = NULL; - - this = THIS; - - frame = myframe; - local = frame->local; - inode = local->validate_loc.inode; - - if (-1 == req->rpc_status) { - rsp.op_ret = -1; - op_errno = ENOTCONN; - goto out; - } + 
quota_local_t *local = NULL; + call_frame_t *frame = NULL; + int ret = 0; + gfs3_lookup_rsp rsp = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + int op_errno = EINVAL; + dict_t *xdata = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + quota_priv_t *priv = NULL; + struct timespec retry_delay = { + 0, + }; + gf_timer_t *timer = NULL; + + this = THIS; + + frame = myframe; + local = frame->local; + inode = local->validate_loc.inode; + priv = this->private; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + op_errno = ENOTCONN; + goto out; + } + + ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfs3_lookup_rsp); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_XDR_DECODING_FAILED, + "XDR decoding failed"); + rsp.op_ret = -1; + op_errno = EINVAL; + goto out; + } - ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_lookup_rsp); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed"); - rsp.op_ret = -1; - op_errno = EINVAL; - goto out; - } + op_errno = gf_error_to_errno(rsp.op_errno); + gf_stat_to_iatt(&rsp.postparent, &postparent); - op_errno = gf_error_to_errno (rsp.op_errno); - gf_stat_to_iatt (&rsp.postparent, &postparent); + if (rsp.op_ret == -1) + goto out; - if (rsp.op_ret == -1) - goto out; + rsp.op_ret = -1; + gf_stat_to_iatt(&rsp.stat, &stbuf); + + GF_PROTOCOL_DICT_UNSERIALIZE(frame->this, xdata, (rsp.xdata.xdata_val), + (rsp.xdata.xdata_len), rsp.op_ret, op_errno, + out); + if ((!gf_uuid_is_null(inode->gfid)) && + (gf_uuid_compare(stbuf.ia_gfid, inode->gfid) != 0)) { + gf_msg_debug(frame->this->name, ESTALE, "gfid changed for %s", + local->validate_loc.path); rsp.op_ret = -1; - gf_stat_to_iatt (&rsp.stat, &stbuf); - - GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata, (rsp.xdata.xdata_val), - (rsp.xdata.xdata_len), rsp.op_ret, - op_errno, out); - - if ((!uuid_is_null (inode->gfid)) - && (uuid_compare (stbuf.ia_gfid, inode->gfid) != 0)) { - gf_log (frame->this->name, 
GF_LOG_DEBUG, - "gfid changed for %s", local->validate_loc.path); - rsp.op_ret = -1; - op_errno = ESTALE; - goto out; - } + op_errno = ESTALE; + goto out; + } - rsp.op_ret = 0; + rsp.op_ret = 0; out: - rsp.op_errno = op_errno; - if (rsp.op_ret == -1) { - /* any error other than ENOENT */ - if (rsp.op_errno != ENOENT) - gf_log (this->name, GF_LOG_WARNING, - "remote operation failed: %s. Path: %s (%s)", - strerror (rsp.op_errno), - local->validate_loc.path, - loc_gfid_utoa (&local->validate_loc)); - else - gf_log (this->name, GF_LOG_TRACE, - "not found on remote node"); + rsp.op_errno = op_errno; + + /* We need to retry connecting to quotad on ENOTCONN error. + * Suppose if there are two volumes vol1 and vol2, + * and quota is enabled and limit is set on vol1. + * Now if IO is happening on vol1 and quota is enabled/disabled + * on vol2, quotad gets restarted and client will receive + * ENOTCONN in the IO path of vol1 + */ + if (rsp.op_ret == -1 && rsp.op_errno == ENOTCONN) { + if (local->quotad_conn_retry >= 12) { + priv->quotad_conn_status = 1; + gf_log(this->name, GF_LOG_WARNING, + "failed to connect " + "to quotad after retry count %d)", + local->quotad_conn_retry); + } else { + local->quotad_conn_retry++; + } + if (priv->quotad_conn_status == 0) { + /* retry connecting after 5secs for 12 retries + * (up to 60sec). 
+ */ + gf_log(this->name, GF_LOG_DEBUG, + "retry connecting to " + "quotad (retry count %d)", + local->quotad_conn_retry); + + retry_delay.tv_sec = 5; + retry_delay.tv_nsec = 0; + timer = gf_timer_call_after(this->ctx, retry_delay, + _quota_enforcer_lookup, (void *)frame); + if (timer == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "failed to " + "set quota_enforcer_lookup with timer"); + } else { + goto clean; + } } + } else { + priv->quotad_conn_status = 0; + } + + if (rsp.op_ret == -1) { + /* any error other than ENOENT */ + if (rsp.op_errno != ENOENT) + gf_msg( + this->name, GF_LOG_WARNING, rsp.op_errno, Q_MSG_LOOKUP_FAILED, + "Getting cluster-wide size of directory failed " + "(path: %s gfid:%s)", + local->validate_loc.path, loc_gfid_utoa(&local->validate_loc)); + else + gf_msg_trace(this->name, ENOENT, "not found on remote node"); - local->validate_cbk (frame, NULL, this, rsp.op_ret, rsp.op_errno, inode, - &stbuf, xdata, &postparent); + } else if (local->quotad_conn_retry) { + gf_log(this->name, GF_LOG_DEBUG, + "connected to quotad after " + "retry count %d", + local->quotad_conn_retry); + } - if (xdata) - dict_unref (xdata); + local->validate_cbk(frame, NULL, this, rsp.op_ret, rsp.op_errno, inode, + &stbuf, xdata, &postparent); - free (rsp.xdata.xdata_val); +clean: + if (xdata) + dict_unref(xdata); - return 0; + free(rsp.xdata.xdata_val); + + return 0; } -int -quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata, fop_lookup_cbk_t validate_cbk) +void +_quota_enforcer_lookup(void *data) { - quota_local_t *local = NULL; - gfs3_lookup_req req = {{0,},}; - int ret = 0; - int op_errno = ESTALE; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + gfs3_lookup_req req = { + { + 0, + }, + }; + int ret = 0; + int op_errno = ESTALE; + quota_priv_t *priv = NULL; + call_frame_t *frame = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + char *dir_path = NULL; + + frame = data; + local = frame->local; + this = local->this; 
+ loc = &local->validate_loc; + + priv = this->private; + + if (!(loc && loc->inode)) + goto unwind; + + if (!gf_uuid_is_null(loc->inode->gfid)) + memcpy(req.gfid, loc->inode->gfid, 16); + else + memcpy(req.gfid, loc->gfid, 16); + + if (local->validate_xdata) { + GF_PROTOCOL_DICT_SERIALIZE(this, local->validate_xdata, + (&req.xdata.xdata_val), req.xdata.xdata_len, + op_errno, unwind); + } + + if (loc->name) + req.bname = (char *)loc->name; + else + req.bname = ""; + + if (loc->path) + dir_path = (char *)loc->path; + else + dir_path = ""; + + ret = quota_enforcer_submit_request( + &req, frame, priv->quota_enforcer, GF_AGGREGATOR_LOOKUP, NULL, this, + quota_enforcer_lookup_cbk, (xdrproc_t)xdr_gfs3_lookup_req); + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPC_SUBMIT_FAILED, + "Couldn't send the request to " + "fetch cluster wide size of directory (path:%s gfid:%s)", + dir_path, req.gfid); + } + + GF_FREE(req.xdata.xdata_val); + + return; - if (!frame || !this || !loc) - goto unwind; +unwind: + local->validate_cbk(frame, NULL, this, -1, op_errno, NULL, NULL, NULL, + NULL); - local = frame->local; - local->validate_cbk = validate_cbk; + GF_FREE(req.xdata.xdata_val); - priv = this->private; + return; +} - if (!(loc && loc->inode)) - goto unwind; +int +quota_enforcer_lookup(call_frame_t *frame, xlator_t *this, dict_t *xdata, + fop_lookup_cbk_t validate_cbk) +{ + quota_local_t *local = NULL; - if (!uuid_is_null (loc->inode->gfid)) - memcpy (req.gfid, loc->inode->gfid, 16); - else - memcpy (req.gfid, loc->gfid, 16); + if (!frame || !this) + goto unwind; - if (xdata) { - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, - (&req.xdata.xdata_val), - req.xdata.xdata_len, - op_errno, unwind); - } + local = frame->local; + local->this = this; + local->validate_cbk = validate_cbk; + local->validate_xdata = dict_ref(xdata); - if (loc->name) - req.bname = (char *)loc->name; - else - req.bname = ""; + _quota_enforcer_lookup(frame); - ret = quota_enforcer_submit_request (&req, 
frame, - priv->quota_enforcer, - GF_AGGREGATOR_LOOKUP, - NULL, this, - quota_enforcer_lookup_cbk, - (xdrproc_t)xdr_gfs3_lookup_req); + return 0; - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "failed to send the fop"); - } +unwind: + validate_cbk(frame, NULL, this, -1, ESTALE, NULL, NULL, NULL, NULL); - GF_FREE (req.xdata.xdata_val); + return 0; +} - return 0; +int +quota_enforcer_notify(struct rpc_clnt *rpc, void *mydata, + rpc_clnt_event_t event, void *data) +{ + xlator_t *this = NULL; + int ret = 0; + quota_priv_t *priv = NULL; + + this = mydata; + priv = this->private; + switch (event) { + case RPC_CLNT_CONNECT: { + pthread_mutex_lock(&priv->conn_mutex); + { + priv->conn_status = _gf_true; + } + pthread_mutex_unlock(&priv->conn_mutex); + gf_msg_trace(this->name, 0, "got RPC_CLNT_CONNECT"); + break; + } -unwind: - validate_cbk (frame, NULL, this, -1, op_errno, NULL, NULL, NULL, NULL); + case RPC_CLNT_DISCONNECT: { + pthread_mutex_lock(&priv->conn_mutex); + { + priv->conn_status = _gf_false; + pthread_cond_signal(&priv->conn_cond); + } + pthread_mutex_unlock(&priv->conn_mutex); + gf_msg_trace(this->name, 0, "got RPC_CLNT_DISCONNECT"); + break; + } - GF_FREE (req.xdata.xdata_val); + default: + gf_msg_trace(this->name, 0, "got some other RPC event %d", event); + ret = 0; + break; + } - return 0; + return ret; } int -quota_enforcer_notify (struct rpc_clnt *rpc, void *mydata, - rpc_clnt_event_t event, void *data) +quota_enforcer_blocking_connect(rpc_clnt_t *rpc) { - xlator_t *this = NULL; - int ret = 0; + dict_t *options = NULL; + int ret = -1; - this = mydata; + options = dict_new(); + if (options == NULL) + goto out; - switch (event) { - case RPC_CLNT_CONNECT: - { - gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_CONNECT"); - break; - } + ret = dict_set_sizen_str_sizen(options, "non-blocking-io", "no"); + if (ret) + goto out; - case RPC_CLNT_DISCONNECT: - { - gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_DISCONNECT"); - break; - } + 
rpc->conn.trans->reconfigure(rpc->conn.trans, options); - default: - gf_log (this->name, GF_LOG_TRACE, - "got some other RPC event %d", event); - ret = 0; - break; - } + rpc_clnt_start(rpc); - return ret; + ret = dict_set_sizen_str_sizen(options, "non-blocking-io", "yes"); + if (ret) + goto out; + + rpc->conn.trans->reconfigure(rpc->conn.trans, options); + + ret = 0; +out: + if (options) + dict_unref(options); + + return ret; } -//Returns a started rpc_clnt. Creates a new rpc_clnt if quota_priv doesn't have -//one already +// Returns a started rpc_clnt. Creates a new rpc_clnt if quota_priv doesn't have +// one already struct rpc_clnt * -quota_enforcer_init (xlator_t *this, dict_t *options) +quota_enforcer_init(xlator_t *this, dict_t *options) { - struct rpc_clnt *rpc = NULL; - quota_priv_t *priv = NULL; - int ret = -1; + struct rpc_clnt *rpc = NULL; + quota_priv_t *priv = NULL; + int ret = -1; - priv = this->private; + priv = this->private; + + LOCK(&priv->lock); + { if (priv->rpc_clnt) { - gf_log (this->name, GF_LOG_TRACE, "quota enforcer clnt already " - "inited"); - //Turns out to be a NOP if the clnt is already connected. 
- rpc_clnt_start (priv->rpc_clnt); - return priv->rpc_clnt; + ret = 0; + rpc = priv->rpc_clnt; } - priv->quota_enforcer = &quota_enforcer_clnt; + } + UNLOCK(&priv->lock); - ret = dict_set_str (options, "transport.address-family", "unix"); - if (ret) - goto out; + if (rpc) + goto out; - ret = dict_set_str (options, "transport-type", "socket"); - if (ret) - goto out; + priv->quota_enforcer = &quota_enforcer_clnt; - ret = dict_set_str (options, "transport.socket.connect-path", - "/tmp/quotad.socket"); - if (ret) - goto out; + ret = dict_set_sizen_str_sizen(options, "transport.address-family", "unix"); + if (ret) + goto out; - rpc = rpc_clnt_new (options, this->ctx, this->name, 16); - if (!rpc) { - ret = -1; - goto out; - } + ret = dict_set_sizen_str_sizen(options, "transport-type", "socket"); + if (ret) + goto out; - ret = rpc_clnt_register_notify (rpc, quota_enforcer_notify, this); - if (ret) { - gf_log ("cli", GF_LOG_ERROR, "failed to register notify"); - goto out; - } + ret = dict_set_sizen_str_sizen(options, "transport.socket.connect-path", + "/var/run/gluster/quotad.socket"); + if (ret) + goto out; - rpc_clnt_start (rpc); + rpc = rpc_clnt_new(options, this, this->name, 16); + if (!rpc) { + ret = -1; + goto out; + } + + ret = rpc_clnt_register_notify(rpc, quota_enforcer_notify, this); + if (ret) { + gf_msg("quota", GF_LOG_ERROR, 0, Q_MSG_RPCCLNT_REGISTER_NOTIFY_FAILED, + "failed to register notify"); + goto out; + } + + ret = quota_enforcer_blocking_connect(rpc); + if (ret) + goto out; + + ret = 0; out: - if (ret) { - if (rpc) - rpc_clnt_unref (rpc); - rpc = NULL; - } + if (ret) { + if (rpc) + rpc_clnt_unref(rpc); + rpc = NULL; + } - return rpc; + return rpc; } struct rpc_clnt_procedure quota_enforcer_actors[GF_AGGREGATOR_MAXVALUE] = { - [GF_AGGREGATOR_NULL] = {"NULL", NULL}, - [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", NULL}, + [GF_AGGREGATOR_NULL] = {"NULL", NULL}, + [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", NULL}, }; struct rpc_clnt_program quota_enforcer_clnt = { - .progname = 
"Quota enforcer", - .prognum = GLUSTER_AGGREGATOR_PROGRAM, - .progver = GLUSTER_AGGREGATOR_VERSION, - .numproc = GF_AGGREGATOR_MAXVALUE, - .proctable = quota_enforcer_actors, + .progname = "Quota enforcer", + .prognum = GLUSTER_AGGREGATOR_PROGRAM, + .progver = GLUSTER_AGGREGATOR_VERSION, + .numproc = GF_AGGREGATOR_MAXVALUE, + .proctable = quota_enforcer_actors, }; diff --git a/xlators/features/quota/src/quota-mem-types.h b/xlators/features/quota/src/quota-mem-types.h index 97d9165681f..782a7de96bb 100644 --- a/xlators/features/quota/src/quota-mem-types.h +++ b/xlators/features/quota/src/quota-mem-types.h @@ -10,21 +10,21 @@ #ifndef __QUOTA_MEM_TYPES_H__ #define __QUOTA_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_quota_mem_types_ { - gf_quota_mt_quota_priv_t = gf_common_mt_end + 1, - gf_quota_mt_quota_inode_ctx_t, - gf_quota_mt_loc_t, - gf_quota_mt_char, - gf_quota_mt_int64_t, - gf_quota_mt_int32_t, - gf_quota_mt_limits_t, - gf_quota_mt_quota_dentry_t, - gf_quota_mt_quota_limits_level_t, - gf_quota_mt_qd_vols_conf_t, - gf_quota_mt_aggregator_state_t, - gf_quota_mt_end + /* Those are used by QUOTA_ALLOC_OR_GOTO macro */ + gf_quota_mt_quota_priv_t = gf_common_mt_end + 1, + gf_quota_mt_quota_inode_ctx_t, + gf_quota_mt_loc_t, + gf_quota_mt_char, + gf_quota_mt_int64_t, + gf_quota_mt_int32_t, + gf_quota_mt_limits_t, + gf_quota_mt_quota_dentry_t, + gf_quota_mt_quota_limits_level_t, + gf_quota_mt_qd_vols_conf_t, + gf_quota_mt_aggregator_state_t, + gf_quota_mt_end }; #endif - diff --git a/xlators/features/quota/src/quota-messages.h b/xlators/features/quota/src/quota-messages.h new file mode 100644 index 00000000000..d434ed75e76 --- /dev/null +++ b/xlators/features/quota/src/quota-messages.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _QUOTA_MESSAGES_H_ +#define _QUOTA_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(QUOTA, Q_MSG_ENFORCEMENT_FAILED, Q_MSG_ENOMEM, Q_MSG_PARENT_NULL, + Q_MSG_CROSSED_SOFT_LIMIT, Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED, + Q_MSG_REMOTE_OPERATION_FAILED, Q_MSG_FAILED_TO_SEND_FOP, + Q_MSG_INVALID_VOLFILE, Q_MSG_INODE_PARENT_NOT_FOUND, + Q_MSG_XDR_DECODE_ERROR, Q_MSG_DICT_UNSERIALIZE_FAIL, + Q_MSG_DICT_SERIALIZE_FAIL, Q_MSG_RPCSVC_INIT_FAILED, + Q_MSG_RPCSVC_LISTENER_CREATION_FAILED, Q_MSG_RPCSVC_REGISTER_FAILED, + Q_MSG_XDR_DECODING_FAILED, Q_MSG_RPCCLNT_REGISTER_NOTIFY_FAILED, + Q_MSG_ANCESTRY_BUILD_FAILED, Q_MSG_SIZE_KEY_MISSING, + Q_MSG_INODE_CTX_GET_FAILED, Q_MSG_INODE_CTX_SET_FAILED, + Q_MSG_LOOKUP_FAILED, Q_MSG_RPC_SUBMIT_FAILED, + Q_MSG_ENFORCEMENT_SKIPPED, Q_MSG_INTERNAL_FOP_KEY_MISSING); + +#endif /* !_QUOTA_MESSAGES_H_ */ diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index 3c6c31def42..18df9ae6d19 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -7,3745 +7,4636 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#include <fnmatch.h> #include "quota.h" -#include "common-utils.h" -#include "defaults.h" -#include "statedump.h" - -int32_t -quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this, - char *name, uuid_t par); - -int -quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict, - loc_t *loc, struct iatt *buf, int32_t *op_errno); +#include <glusterfs/statedump.h> +#include "quota-messages.h" +#include <glusterfs/events.h> struct volume_options options[]; static int32_t -__quota_init_inode_ctx (inode_t *inode, xlator_t *this, - quota_inode_ctx_t **context) +__quota_init_inode_ctx(inode_t *inode, xlator_t *this, + quota_inode_ctx_t **context) { - int32_t ret = -1; - quota_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + quota_inode_ctx_t *ctx = NULL; - if (inode == NULL) { - goto out; - } + if (inode == NULL) { + goto out; + } - QUOTA_ALLOC_OR_GOTO (ctx, quota_inode_ctx_t, out); + QUOTA_ALLOC_OR_GOTO(ctx, quota_inode_ctx_t, out); - LOCK_INIT(&ctx->lock); + LOCK_INIT(&ctx->lock); - if (context != NULL) { - *context = ctx; - } + if (context != NULL) { + *context = ctx; + } - INIT_LIST_HEAD (&ctx->parents); + INIT_LIST_HEAD(&ctx->parents); - ret = __inode_ctx_put (inode, this, (uint64_t )(long)ctx); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set quota context in inode (gfid:%s)", - uuid_utoa (inode->gfid)); - } + ret = __inode_ctx_put(inode, this, (uint64_t)(long)ctx); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_SET_FAILED, + "cannot set quota context " + "in inode (gfid:%s)", + uuid_utoa(inode->gfid)); + GF_FREE(ctx); + } out: - return ret; + return ret; } - static int32_t -quota_inode_ctx_get (inode_t *inode, xlator_t *this, - quota_inode_ctx_t **ctx, char create_if_absent) +quota_inode_ctx_get(inode_t *inode, xlator_t *this, quota_inode_ctx_t **ctx, + char create_if_absent) { - int32_t ret = 0; - uint64_t ctx_int; + int32_t ret = 0; + uint64_t ctx_int; - LOCK (&inode->lock); - { - ret = 
__inode_ctx_get (inode, this, &ctx_int); + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_int); - if ((ret == 0) && (ctx != NULL)) { - *ctx = (quota_inode_ctx_t *) (unsigned long)ctx_int; - } else if (create_if_absent) { - ret = __quota_init_inode_ctx (inode, this, ctx); - } + if ((ret == 0) && (ctx != NULL)) { + *ctx = (quota_inode_ctx_t *)(unsigned long)ctx_int; + } else if (create_if_absent) { + ret = __quota_init_inode_ctx(inode, this, ctx); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - return ret; + return ret; } int -quota_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path) +quota_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path) { - int ret = -1; + int ret = -1; - if (!loc || (inode == NULL)) - return ret; + if (!loc || (inode == NULL)) + return ret; - if (inode) { - loc->inode = inode_ref (inode); - uuid_copy (loc->gfid, inode->gfid); - } + if (inode) { + loc->inode = inode_ref(inode); + gf_uuid_copy(loc->gfid, inode->gfid); + } - if (parent) { - loc->parent = inode_ref (parent); - } + if (parent) { + loc->parent = inode_ref(parent); + } - if (path != NULL) { - loc->path = gf_strdup (path); + if (path != NULL) { + loc->path = gf_strdup(path); - loc->name = strrchr (loc->path, '/'); - if (loc->name) { - loc->name++; - } + loc->name = strrchr(loc->path, '/'); + if (loc->name) { + loc->name++; } + } - ret = 0; - - if (ret < 0) { - loc_wipe (loc); - } + ret = 0; - return ret; + return ret; } - int -quota_inode_loc_fill (inode_t *inode, loc_t *loc) +quota_inode_loc_fill(inode_t *inode, loc_t *loc) { - char *resolvedpath = NULL; - inode_t *parent = NULL; - int ret = -1; - xlator_t *this = NULL; + char *resolvedpath = NULL; + inode_t *parent = NULL; + int ret = -1; + xlator_t *this = NULL; - if ((!inode) || (!loc)) { - return ret; - } + if ((!inode) || (!loc)) { + return ret; + } - this = THIS; + this = THIS; - if ((inode) && __is_root_gfid (inode->gfid)) { - loc->parent = NULL; - goto ignore_parent; - } 
+ if ((inode) && __is_root_gfid(inode->gfid)) { + loc->parent = NULL; + goto ignore_parent; + } - parent = inode_parent (inode, 0, NULL); - if (!parent) { - gf_log (this->name, GF_LOG_DEBUG, - "cannot find parent for inode (gfid:%s)", - uuid_utoa (inode->gfid)); - } + parent = inode_parent(inode, 0, NULL); + if (!parent) { + gf_msg_debug(this->name, 0, + "cannot find parent for " + "inode (gfid:%s)", + uuid_utoa(inode->gfid)); + } ignore_parent: - ret = inode_path (inode, NULL, &resolvedpath); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "cannot construct path for inode (gfid:%s)", - uuid_utoa (inode->gfid)); - } - - ret = quota_loc_fill (loc, inode, parent, resolvedpath); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "cannot fill loc"); - goto err; - } + ret = inode_path(inode, NULL, &resolvedpath); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "cannot construct path for " + "inode (gfid:%s)", + uuid_utoa(inode->gfid)); + } + + ret = quota_loc_fill(loc, inode, parent, resolvedpath); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot fill loc"); + goto err; + } err: - if (parent) { - inode_unref (parent); - } + if (parent) { + inode_unref(parent); + } - GF_FREE (resolvedpath); + GF_FREE(resolvedpath); - return ret; + return ret; } - int32_t -quota_local_cleanup (xlator_t *this, quota_local_t *local) +quota_local_cleanup(quota_local_t *local) { - if (local == NULL) { - goto out; - } + if (local == NULL) { + goto out; + } + + loc_wipe(&local->loc); + loc_wipe(&local->newloc); + loc_wipe(&local->oldloc); + loc_wipe(&local->validate_loc); + + inode_unref(local->inode); - loc_wipe (&local->loc); - loc_wipe (&local->newloc); - loc_wipe (&local->oldloc); - loc_wipe (&local->validate_loc); + if (local->xdata) + dict_unref(local->xdata); - inode_unref (local->inode); - LOCK_DESTROY (&local->lock); + if (local->validate_xdata) + dict_unref(local->validate_xdata); - mem_put (local); + if (local->stub) + 
call_stub_destroy(local->stub); + + LOCK_DESTROY(&local->lock); + + mem_put(local); out: - return 0; + return 0; } - -static inline quota_local_t * -quota_local_new () +static quota_local_t * +quota_local_new() { - quota_local_t *local = NULL; - local = mem_get0 (THIS->local_pool); - if (local == NULL) - goto out; + quota_local_t *local = NULL; + local = mem_get0(THIS->local_pool); + if (local == NULL) + goto out; - LOCK_INIT (&local->lock); - local->space_available = -1; + LOCK_INIT(&local->lock); + local->space_available = -1; out: - return local; + return local; } - quota_dentry_t * -__quota_dentry_new (quota_inode_ctx_t *ctx, char *name, uuid_t par) +__quota_dentry_new(quota_inode_ctx_t *ctx, char *name, uuid_t par) { - quota_dentry_t *dentry = NULL; - GF_UNUSED int32_t ret = 0; + quota_dentry_t *dentry = NULL; + GF_UNUSED int32_t ret = 0; - QUOTA_ALLOC_OR_GOTO (dentry, quota_dentry_t, err); + QUOTA_ALLOC_OR_GOTO(dentry, quota_dentry_t, err); - INIT_LIST_HEAD (&dentry->next); + INIT_LIST_HEAD(&dentry->next); - dentry->name = gf_strdup (name); - if (dentry->name == NULL) { - GF_FREE (dentry); - goto err; - } + dentry->name = gf_strdup(name); + if (dentry->name == NULL) { + GF_FREE(dentry); + dentry = NULL; + goto err; + } - uuid_copy (dentry->par, par); + gf_uuid_copy(dentry->par, par); - if (ctx != NULL) - list_add_tail (&dentry->next, &ctx->parents); + if (ctx != NULL) + list_add_tail(&dentry->next, &ctx->parents); err: - return dentry; + return dentry; } - void -__quota_dentry_free (quota_dentry_t *dentry) +__quota_dentry_free(quota_dentry_t *dentry) { - if (dentry == NULL) { - goto out; - } + if (dentry == NULL) { + goto out; + } - list_del_init (&dentry->next); + list_del_init(&dentry->next); - GF_FREE (dentry->name); - GF_FREE (dentry); + GF_FREE(dentry->name); + GF_FREE(dentry); out: - return; + return; } -inline void -quota_resume_fop_if_validation_done (quota_local_t *local) +void +__quota_dentry_del(quota_inode_ctx_t *ctx, const char *name, uuid_t par) 
{ - call_stub_t *stub = NULL; - int link_count = -1; - - if (local == NULL) - goto out; - - LOCK (&local->lock); - { - link_count = local->link_count; - if (link_count == 0) { - stub = local->stub; - local->stub = NULL; - } - } - UNLOCK (&local->lock); + quota_dentry_t *dentry = NULL; + quota_dentry_t *tmp = NULL; - if (stub != NULL) { - call_resume (stub); + list_for_each_entry_safe(dentry, tmp, &ctx->parents, next) + { + if ((strcmp(dentry->name, name) == 0) && + (gf_uuid_compare(dentry->par, par) == 0)) { + __quota_dentry_free(dentry); + break; } -out: - return; + } } -inline void -quota_handle_validate_error (quota_local_t *local, int32_t op_ret, - int32_t op_errno) +void +quota_dentry_del(quota_inode_ctx_t *ctx, const char *name, uuid_t par) { - if (local == NULL) - goto out; - - LOCK (&local->lock); - { - if (op_ret < 0) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - /* we abort checking limits on this path to root */ - local->link_count--; - } - UNLOCK (&local->lock); - - quota_resume_fop_if_validation_done (local); -out: - return; + LOCK(&ctx->lock); + { + __quota_dentry_del(ctx, name, par); + } + UNLOCK(&ctx->lock); } -int32_t -quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) +static inode_t * +__quota_inode_parent(inode_t *inode, uuid_t pargfid, const char *name) { - quota_local_t *local = NULL; - int32_t ret = 0; - quota_inode_ctx_t *ctx = NULL; - int64_t *size = 0; - uint64_t value = 0; - - local = frame->local; - - if (op_ret < 0) { - goto unwind; - } - - GF_ASSERT (local); - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, unwind, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, unwind, op_errno, - EINVAL); + inode_t *parent = NULL; - ret = inode_ctx_get (local->validate_loc.inode, this, &value); - - ctx = (quota_inode_ctx_t *)(unsigned long)value; - if 
((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "quota context is not present in inode (gfid:%s)", - uuid_utoa (local->validate_loc.inode->gfid)); - op_errno = EINVAL; - goto unwind; - } - - ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "size key not present in dict"); - op_errno = EINVAL; - goto unwind; - } - - local->just_validated = 1; /* so that we don't go into infinite - * loop of validation and checking - * limit when timeout is zero. - */ - LOCK (&ctx->lock); - { - ctx->size = ntoh64 (*size); - gettimeofday (&ctx->tv, NULL); - } - UNLOCK (&ctx->lock); - - quota_check_limit (frame, local->validate_loc.inode, this, NULL, NULL); - return 0; - -unwind: - quota_handle_validate_error (local, op_ret, op_errno); - return 0; + parent = inode_parent(inode, pargfid, name); + inode_unref(inode); + return parent; } - -static inline uint64_t -quota_time_elapsed (struct timeval *now, struct timeval *then) +static inode_t * +quota_inode_parent(inode_t *inode, uuid_t pargfid, const char *name) { - return (now->tv_sec - then->tv_sec); -} + inode_t *parent = NULL; + parent = __quota_inode_parent(inode, pargfid, name); + if (!parent) + gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, Q_MSG_PARENT_NULL, + "Failed to find " + "ancestor for inode (%s)", + uuid_utoa(inode->gfid)); + + return parent; +} int32_t -quota_timeout (struct timeval *tv, int32_t timeout) +quota_inode_depth(inode_t *inode) { - struct timeval now = {0,}; - int32_t timed_out = 0; + int depth = 0; + inode_t *cur_inode = NULL; - gettimeofday (&now, NULL); + cur_inode = inode_ref(inode); + while (cur_inode && !__is_root_gfid(cur_inode->gfid)) { + depth++; + cur_inode = quota_inode_parent(cur_inode, 0, NULL); + if (!cur_inode) + depth = -1; + } - if (quota_time_elapsed (&now, tv) >= timeout) { - timed_out = 1; - } + if (cur_inode) + inode_unref(cur_inode); - return timed_out; + return depth; } int32_t 
-quota_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) +quota_find_common_ancestor(inode_t *inode1, inode_t *inode2, + uuid_t *common_ancestor) { - inode_t *parent = NULL; - gf_dirent_t *entry = NULL; - loc_t loc = {0, }; - quota_dentry_t *dentry = NULL, *tmp = NULL; - quota_inode_ctx_t *ctx = NULL; - struct list_head parents = {0, }; - quota_local_t *local = NULL; - call_frame_t *continuation_frame = NULL; - - INIT_LIST_HEAD (&parents); - - continuation_frame = frame->local; - frame->local = NULL; + int32_t depth1 = 0; + int32_t depth2 = 0; + int32_t ret = -1; + inode_t *cur_inode1 = NULL; + inode_t *cur_inode2 = NULL; + + depth1 = quota_inode_depth(inode1); + if (depth1 < 0) + goto out; + + depth2 = quota_inode_depth(inode2); + if (depth2 < 0) + goto out; + + cur_inode1 = inode_ref(inode1); + cur_inode2 = inode_ref(inode2); + + while (cur_inode1 && depth1 > depth2) { + cur_inode1 = quota_inode_parent(cur_inode1, 0, NULL); + depth1--; + } + + while (cur_inode2 && depth2 > depth1) { + cur_inode2 = quota_inode_parent(cur_inode2, 0, NULL); + depth2--; + } + + while (depth1 && cur_inode1 && cur_inode2 && cur_inode1 != cur_inode2) { + cur_inode1 = quota_inode_parent(cur_inode1, 0, NULL); + cur_inode2 = quota_inode_parent(cur_inode2, 0, NULL); + depth1--; + } + + if (cur_inode1 && cur_inode2) { + gf_uuid_copy(*common_ancestor, cur_inode1->gfid); + ret = 0; + } +out: + if (cur_inode1) + inode_unref(cur_inode1); - local = continuation_frame->local; + if (cur_inode2) + inode_unref(cur_inode2); - if (op_ret < 0) - goto err; + return ret; +} - parent = inode_parent (local->validate_loc.inode, 0, NULL); - if (parent == NULL) { - gf_log (this->name, GF_LOG_WARNING, "parent is NULL"); - op_errno = EINVAL; - goto err; +void +check_ancestory_continue(struct list_head *parents, inode_t *inode, + int32_t op_ret, int32_t op_errno, void *data) +{ + call_frame_t *frame = NULL; + 
quota_local_t *local = NULL; + uint32_t link_count = 0; + + frame = data; + local = frame->local; + + if (parents && list_empty(parents)) { + gf_msg(THIS->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED, + "Couldn't build ancestry for inode (gfid:%s). " + "Without knowing ancestors till root, quota " + "cannot be enforced. " + "Hence, failing fop with EIO", + uuid_utoa(inode->gfid)); + op_errno = EIO; + op_ret = -1; + } + + LOCK(&local->lock); + { + link_count = --local->link_count; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; } + } + UNLOCK(&local->lock); - if ((op_ret > 0) && (entries != NULL)) { - list_for_each_entry (entry, &entries->list, list) { - if (__is_root_gfid (entry->inode->gfid)) { - /* The list contains a sub-list for each - * possible path to the target inode. Each - * sub-list starts with the root entry of the - * tree and is followed by the child entries - * for a particular path to the target entry. - * The root entry is an implied sub-list - * delimiter, as it denotes we have started - * processing a new path. 
Reset the parent - * pointer and continue - */ + if (link_count == 0) + local->fop_continue_cbk(frame); +} - parent = NULL; - } +void +check_ancestory(call_frame_t *frame, inode_t *inode) +{ + inode_t *cur_inode = NULL; + inode_t *parent = NULL; - uuid_copy (loc.gfid, entry->d_stat.ia_gfid); + cur_inode = inode_ref(inode); + while (cur_inode && !__is_root_gfid(cur_inode->gfid)) { + parent = inode_parent(cur_inode, 0, NULL); + if (!parent) { + quota_build_ancestry(cur_inode, check_ancestory_continue, frame); + inode_unref(cur_inode); + return; + } + inode_unref(cur_inode); + cur_inode = parent; + } + + if (cur_inode) { + inode_unref(cur_inode); + check_ancestory_continue(NULL, NULL, 0, 0, frame); + } else { + check_ancestory_continue(NULL, NULL, -1, ESTALE, frame); + } +} - loc.inode = inode_ref (entry->inode); - loc.parent = inode_ref (parent); - loc.name = entry->d_name; +void +check_ancestory_2_cbk(struct list_head *parents, inode_t *inode, int32_t op_ret, + int32_t op_errno, void *data) +{ + inode_t *this_inode = NULL; + quota_inode_ctx_t *ctx = NULL; - quota_fill_inodectx (this, entry->inode, entry->dict, - &loc, &entry->d_stat, &op_errno); + this_inode = data; - parent = entry->inode; + if (op_ret < 0) + goto out; - loc_wipe (&loc); - } - } + if (parents == NULL || list_empty(parents)) { + gf_msg(THIS->name, GF_LOG_WARNING, 0, Q_MSG_ENFORCEMENT_FAILED, + "Couldn't build ancestry for inode (gfid:%s). 
" + "Without knowing ancestors till root, quota " + "cannot be enforced.", + uuid_utoa(this_inode->gfid)); + goto out; + } - quota_inode_ctx_get (local->validate_loc.inode, this, &ctx, 0); + quota_inode_ctx_get(this_inode, THIS, &ctx, 0); + if (ctx) + ctx->ancestry_built = _gf_true; - local->link_count = 0; +out: + inode_unref(this_inode); +} - if (ctx != NULL) { - LOCK (&ctx->lock); - { - list_for_each_entry (dentry, &ctx->parents, next) { - tmp = __quota_dentry_new (NULL, dentry->name, - dentry->par); - list_add_tail (&tmp->next, &parents); - local->link_count++; - } - } - UNLOCK (&ctx->lock); - } +void +check_ancestory_2(xlator_t *this, quota_local_t *local, inode_t *inode) +{ + inode_t *cur_inode = NULL; + inode_t *parent = NULL; + quota_inode_ctx_t *ctx = NULL; + char *name = NULL; + uuid_t pgfid = {0}; + + name = (char *)local->loc.name; + if (local->loc.parent) { + gf_uuid_copy(pgfid, local->loc.parent->gfid); + } + + cur_inode = inode_ref(inode); + while (cur_inode && !__is_root_gfid(cur_inode->gfid)) { + quota_inode_ctx_get(cur_inode, this, &ctx, 0); + /* build ancestry is required only on the first lookup, + * so stop crawling when the inode_ctx is set for an inode + */ + if (ctx && ctx->ancestry_built) + goto setctx; - if (local->link_count != 0) { - list_for_each_entry_safe (dentry, tmp, &parents, next) { - quota_check_limit (continuation_frame, - local->validate_loc.inode, - this, dentry->name, dentry->par); - __quota_dentry_free (dentry); - } - } else { - local->link_count = 1; - quota_check_limit (continuation_frame, parent, this, NULL, - NULL); + parent = inode_parent(cur_inode, pgfid, name); + if (!parent) { + quota_build_ancestry(cur_inode, check_ancestory_2_cbk, + inode_ref(inode)); + goto out; } - STACK_DESTROY (frame->root); - return 0; + if (name != NULL) { + name = NULL; + gf_uuid_clear(pgfid); + } -err: - STACK_DESTROY (frame->root); + inode_unref(cur_inode); + cur_inode = parent; + } - quota_handle_validate_error (local, -1, op_errno); - 
return 0; +setctx: + if (cur_inode && cur_inode != inode) { + quota_inode_ctx_get(inode, this, &ctx, 0); + if (ctx) + ctx->ancestry_built = _gf_true; + } +out: + if (cur_inode) + inode_unref(cur_inode); } -int32_t -quota_build_ancestry_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, dict_t *xdata) +static void +quota_link_count_decrement(call_frame_t *frame) { - int ret = -1; - dict_t *xdata_req = NULL; - quota_local_t *local = NULL; - call_frame_t *continuation_frame = NULL; - - xdata_req = dict_new (); - if (xdata_req == NULL) { - ret = -ENOMEM; - goto err; - } + call_frame_t *tmpframe = NULL; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; + int link_count = -1; + + local = frame->local; + if (local && local->par_frame) { + local = local->par_frame->local; + tmpframe = frame; + } + + if (local == NULL) + goto out; + + LOCK(&local->lock); + { + link_count = --local->link_count; + if (link_count == 0) { + stub = local->stub; + local->stub = NULL; + } + } + UNLOCK(&local->lock); + + if (stub != NULL) { + call_resume(stub); + } - ret = dict_set_int8 (xdata_req, QUOTA_LIMIT_KEY, 1); - if (ret < 0) - goto err; - - ret = dict_set_int8 (xdata_req, GET_ANCESTRY_DENTRY_KEY, 1); - if (ret < 0) - goto err; - - /* This would ask posix layer to construct dentry chain till root */ - STACK_WIND (frame, quota_build_ancestry_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, fd, 0, 0, xdata_req); +out: + if (tmpframe) { + local = tmpframe->local; + tmpframe->local = NULL; - ret = 0; + STACK_DESTROY(frame->root); + if (local) + quota_local_cleanup(local); + } -err: - fd_unref (fd); + return; +} - dict_unref (xdata_req); +static void +quota_handle_validate_error(call_frame_t *frame, int32_t op_ret, + int32_t op_errno) +{ + quota_local_t *local; - if (ret < 0) { - continuation_frame = frame->local; - frame->local = NULL; + local = frame->local; + if (local && local->par_frame) + local = 
local->par_frame->local; - STACK_DESTROY (frame->root); + if (local == NULL) + goto out; - local = continuation_frame->local; - quota_handle_validate_error (local, -1, op_errno); + if (op_ret < 0) { + LOCK(&local->lock); + { + local->op_ret = op_ret; + local->op_errno = op_errno; } - - return ret; + UNLOCK(&local->lock); + } + /* we abort checking limits on this path to root */ + quota_link_count_decrement(frame); +out: + return; } -int -quota_build_ancestry (call_frame_t *frame, inode_t *inode, xlator_t *this) +int32_t +quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - loc_t loc = {0, }; - fd_t *fd = NULL; - quota_local_t *local = NULL; - call_frame_t *new_frame = NULL; - int ret = -1; + quota_local_t *local = NULL; + int32_t ret = 0; + quota_inode_ctx_t *ctx = NULL; + uint64_t value = 0; + quota_meta_t size = { + 0, + }; + + local = frame->local; + + if (op_ret < 0) { + goto unwind; + } + + GF_ASSERT(local); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, unwind, op_errno, EINVAL); + GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, unwind, op_errno, EINVAL); + + ret = inode_ctx_get(local->validate_loc.inode, this, &value); + + ctx = (quota_inode_ctx_t *)(unsigned long)value; + if ((ret == -1) || (ctx == NULL)) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_INODE_CTX_GET_FAILED, + "quota context is" + " not present in inode (gfid:%s)", + uuid_utoa(local->validate_loc.inode->gfid)); + op_errno = EINVAL; + goto unwind; + } + + ret = quota_dict_get_meta(xdata, QUOTA_SIZE_KEY, SLEN(QUOTA_SIZE_KEY), + &size); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING, + "quota size key not present " + "in dict"); + op_errno = EINVAL; + } + + local->just_validated = 1; /* so that we don't go into infinite + * loop of validation and checking + * limit when timeout is zero. 
+ */ + LOCK(&ctx->lock); + { + ctx->size = size.size; + ctx->validate_time = gf_time(); + ctx->file_count = size.file_count; + ctx->dir_count = size.dir_count; + } + UNLOCK(&ctx->lock); + + quota_check_limit(frame, local->validate_loc.inode, this); + return 0; - loc.inode = inode_ref (inode); - uuid_copy (loc.gfid, inode->gfid); +unwind: + quota_handle_validate_error(frame, op_ret, op_errno); + return 0; +} - gf_log (this->name, GF_LOG_WARNING, "building ancestry"); +static inline gf_boolean_t +quota_timeout(time_t t, uint32_t timeout) +{ + return (gf_time() - t) >= timeout; +} - local = frame->local; +/* Return: 1 if new entry added + * 0 no entry added + * -1 on errors + */ +static int32_t +quota_add_parent(struct list_head *list, char *name, uuid_t pgfid) +{ + quota_dentry_t *entry = NULL; + gf_boolean_t found = _gf_false; + int ret = 0; - LOCK (&local->lock); + if (!list_empty(list)) { + list_for_each_entry(entry, list, next) { - loc_wipe (&local->validate_loc); - - ret = quota_inode_loc_fill (inode, &local->validate_loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "cannot fill loc for inode (gfid:%s), hence " - "aborting quota-checks and continuing with fop", - uuid_utoa (inode->gfid)); - } - } - UNLOCK (&local->lock); - - fd = fd_create (inode, 0); - - new_frame = copy_frame (frame); - new_frame->root->uid = new_frame->root->gid = 0; - - new_frame->local = frame; - - if (IA_ISDIR (inode->ia_type)) { - STACK_WIND (new_frame, quota_build_ancestry_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, &loc, fd, - NULL); - } else { - STACK_WIND (new_frame, quota_build_ancestry_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, 0, fd, - NULL); + if (gf_uuid_compare(pgfid, entry->par) == 0) { + found = _gf_true; + goto out; + } } + } - loc_wipe (&loc); + entry = __quota_dentry_new(NULL, name, pgfid); + if (entry) + list_add_tail(&entry->next, list); + else + ret = -1; +out: + if (found) return 0; + else if (ret == 0) + 
return 1; + else + return -1; } -int -quota_validate (call_frame_t *frame, inode_t *inode, xlator_t *this, - fop_lookup_cbk_t cbk_fn) +/* This function iterates the parent list in inode + * context and add unique parent to the list + * Returns number of dentry added to the list, or -1 on errors + */ +static int32_t +quota_add_parents_from_ctx(quota_inode_ctx_t *ctx, struct list_head *list) { - quota_local_t *local = NULL; - int ret = 0; - dict_t *xdata = NULL; - quota_priv_t *priv = NULL; + int ret = 0; + quota_dentry_t *dentry = NULL; + int32_t count = 0; - local = frame->local; - priv = this->private; + if (ctx == NULL || list == NULL) + goto out; - LOCK (&local->lock); + LOCK(&ctx->lock); + { + list_for_each_entry(dentry, &ctx->parents, next) { - loc_wipe (&local->validate_loc); - - ret = quota_inode_loc_fill (inode, &local->validate_loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "cannot fill loc for inode (gfid:%s), hence " - "aborting quota-checks and continuing with fop", - uuid_utoa (inode->gfid)); - } + ret = quota_add_parent(list, dentry->name, dentry->par); + if (ret == 1) + count++; + else if (ret == -1) + break; } - UNLOCK (&local->lock); + } + UNLOCK(&ctx->lock); - if (ret < 0) { - ret = -ENOMEM; - goto err; - } +out: + return (ret == -1) ? 
-1 : count; +} - xdata = dict_new (); - if (xdata == NULL) { - ret = -ENOMEM; - goto err; - } +int32_t +quota_build_ancestry_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + inode_t *parent = NULL; + inode_t *tmp_parent = NULL; + inode_t *linked_inode = NULL; + inode_t *tmp_inode = NULL; + gf_dirent_t *entry = NULL; + loc_t loc = { + 0, + }; + quota_dentry_t *dentry = NULL; + quota_dentry_t *tmp = NULL; + quota_inode_ctx_t *ctx = NULL; + struct list_head parents; + quota_local_t *local = NULL; + int ret; + + INIT_LIST_HEAD(&parents); + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto err; + + if ((op_ret > 0) && (entries != NULL)) { + list_for_each_entry(entry, &entries->list, list) + { + if (__is_root_gfid(entry->inode->gfid)) { + /* The list contains a sub-list for each + * possible path to the target inode. Each + * sub-list starts with the root entry of the + * tree and is followed by the child entries + * for a particular path to the target entry. + * The root entry is an implied sub-list + * delimiter, as it denotes we have started + * processing a new path. 
Reset the parent + * pointer and continue + */ - ret = dict_set_int8 (xdata, QUOTA_SIZE_KEY, 1); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "dict set failed"); - ret = -ENOMEM; - goto err; + tmp_parent = NULL; + } else { + /* For a non-root entry, link this inode */ + linked_inode = inode_link(entry->inode, tmp_parent, + entry->d_name, &entry->d_stat); + if (linked_inode) { + tmp_inode = entry->inode; + entry->inode = linked_inode; + inode_unref(tmp_inode); + } else { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, + Q_MSG_PARENT_NULL, "inode link failed"); + op_errno = EINVAL; + goto err; + } + } + + gf_uuid_copy(loc.gfid, entry->d_stat.ia_gfid); + + loc.inode = inode_ref(entry->inode); + loc.parent = inode_ref(tmp_parent); + loc.name = entry->d_name; + + quota_fill_inodectx(this, entry->inode, entry->dict, &loc, + &entry->d_stat, &op_errno); + + /* For non-directory, posix_get_ancestry_non_directory + * returns all hard-links that are represented by nodes + * adjacent to each other in the dentry-list. + * (Unlike the directory case where adjacent nodes + * either have a parent/child relationship or belong to + * different paths). 
+ */ + if (entry->inode->ia_type == IA_IFDIR) + tmp_parent = entry->inode; + + loc_wipe(&loc); + } + } + + parent = inode_parent(local->loc.inode, 0, NULL); + if (parent == NULL) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_PARENT_NULL, + "parent is NULL"); + op_errno = EINVAL; + goto err; + } + + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + + ret = quota_add_parents_from_ctx(ctx, &parents); + if (ret == -1) { + op_errno = errno; + goto err; + } + + if (list_empty(&parents)) { + /* we built ancestry for a directory */ + list_for_each_entry(entry, &entries->list, list) + { + if (entry->inode == local->loc.inode) + break; } - ret = dict_set_str (xdata, "volume-uuid", priv->volume_uuid); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "dict set failed"); - ret = -ENOMEM; - goto err; - } + /* Getting assertion here, need to investigate + comment for now + GF_ASSERT (&entry->list != &entries->list); + */ - ret = quota_enforcer_lookup (frame, this, &local->validate_loc, xdata, - cbk_fn); - if (ret < 0) { - ret = -ENOTCONN; - goto err; + ret = quota_add_parent(&parents, entry->d_name, parent->gfid); + if (ret == -1) { + op_errno = errno; + goto err; } + } - ret = 0; -err: - return ret; -} + local->ancestry_cbk(&parents, local->loc.inode, 0, 0, local->ancestry_data); + goto cleanup; -int32_t -quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this, - char *name, uuid_t par) -{ - int32_t ret = -1, op_errno = EINVAL; - inode_t *_inode = NULL, *parent = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; - char need_validate = 0; - gf_boolean_t hard_limit_exceeded = 0; - int64_t delta = 0, wouldbe_size = 0; - int64_t space_available = 0; - uint64_t value = 0; - char just_validated = 0; - uuid_t trav_uuid = {0,}; - uint32_t timeout = 0; - - GF_VALIDATE_OR_GOTO ("quota", this, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - GF_VALIDATE_OR_GOTO (this->name, inode, err); - - local = 
frame->local; - GF_VALIDATE_OR_GOTO (this->name, local, err); - - delta = local->delta; - - GF_VALIDATE_OR_GOTO (this->name, local->stub, err); - /* Allow all the trusted clients - * Don't block the gluster internal processes like rebalance, gsyncd, - * self heal etc from the disk quotas. - * - * Method: Allow all the clients with PID negative. This is by the - * assumption that any kernel assigned pid doesn't have the negative - * number. - */ - if (0 > frame->root->pid) { - ret = 0; - LOCK (&local->lock); - { - --local->link_count; - } - UNLOCK (&local->lock); - goto resume; - } - - priv = this->private; +err: + local->ancestry_cbk(NULL, NULL, -1, op_errno, local->ancestry_data); - inode_ctx_get (inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; +cleanup: + STACK_DESTROY(frame->root); + quota_local_cleanup(local); - _inode = inode_ref (inode); + if (parent != NULL) { + inode_unref(parent); + parent = NULL; + } - LOCK (&local->lock); + if (!list_empty(&parents)) { + list_for_each_entry_safe(dentry, tmp, &parents, next) { - just_validated = local->just_validated; - local->just_validated = 0; + __quota_dentry_free(dentry); } - UNLOCK (&local->lock); + } - if ( par != NULL ) { - uuid_copy (trav_uuid, par); - } - - do { - if (ctx != NULL && (ctx->hard_lim > 0 || ctx->soft_lim > 0)) { - wouldbe_size = ctx->size + delta; - - LOCK (&ctx->lock); - { - timeout = priv->soft_timeout; - - if ((ctx->soft_lim >= 0) - && (wouldbe_size > ctx->soft_lim)) { - timeout = priv->hard_timeout; - } - - if (!just_validated - && quota_timeout (&ctx->tv, timeout)) { - need_validate = 1; - } else if (wouldbe_size >= ctx->hard_lim) { - hard_limit_exceeded = 1; - } - } - UNLOCK (&ctx->lock); - - /* We log usage only if quota limit is configured on - that inode. 
*/ - quota_log_usage (this, ctx, _inode, delta); - - if (need_validate) { - ret = quota_validate (frame, _inode, this, - quota_validate_cbk); - if (ret < 0) { - op_errno = -ret; - goto err; - } - - break; - } - - if (hard_limit_exceeded) { - local->op_ret = -1; - local->op_errno = EDQUOT; - - space_available = ctx->hard_lim - ctx->size; - - if (space_available < 0) - space_available = 0; - - if ((local->space_available < 0) - || (local->space_available - > space_available)){ - local->space_available - = space_available; - - } - } - } - - if (__is_root_gfid (_inode->gfid)) { - LOCK (&local->lock); - { - --local->link_count; - } - UNLOCK (&local->lock); + return 0; +} - break; - } +int +quota_build_ancestry(inode_t *inode, quota_ancestry_built_t ancestry_cbk, + void *data) +{ + fd_t *fd = NULL; + quota_local_t *local = NULL; + call_frame_t *new_frame = NULL; + int op_errno = ENOMEM; + int op_ret = -1; + xlator_t *this = NULL; + dict_t *xdata_req = NULL; + + this = THIS; + + xdata_req = dict_new(); + if (xdata_req == NULL) + goto err; + + fd = fd_anonymous(inode); + if (fd == NULL) + goto err; + + new_frame = create_frame(this, this->ctx->pool); + if (new_frame == NULL) + goto err; + + local = quota_local_new(); + if (local == NULL) + goto err; + + new_frame->root->uid = new_frame->root->gid = 0; + new_frame->local = local; + local->ancestry_cbk = ancestry_cbk; + local->ancestry_data = data; + local->loc.inode = inode_ref(inode); + + op_ret = dict_set_int8(xdata_req, QUOTA_LIMIT_KEY, 1); + if (op_ret < 0) { + op_errno = -op_ret; + goto err; + } + + op_ret = dict_set_int8(xdata_req, QUOTA_LIMIT_OBJECTS_KEY, 1); + if (op_ret < 0) { + op_errno = -op_ret; + goto err; + } + + op_ret = dict_set_int8(xdata_req, GET_ANCESTRY_DENTRY_KEY, 1); + if (op_ret < 0) { + op_errno = -op_ret; + goto err; + } + + /* This would ask posix layer to construct dentry chain till root + * We don't need to do a opendir, we can use the anonymous fd + * here for the readidrp. 
+ * avoiding opendir also reduces the window size where another FOP + * can be executed before completion of build ancestry + */ + STACK_WIND(new_frame, quota_build_ancestry_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, 0, 0, xdata_req); + + op_ret = 0; - parent = inode_parent (_inode, trav_uuid, name); +err: + if (fd) + fd_unref(fd); - if (name != NULL) { - name = NULL; - uuid_clear (trav_uuid); - } + if (xdata_req) + dict_unref(xdata_req); - if (parent == NULL) { - ret = quota_build_ancestry (frame, _inode, this); - if (ret < 0) { - op_errno = -ret; - goto err; - } + if (op_ret < 0) { + ancestry_cbk(NULL, NULL, -1, op_errno, data); - break; - } + if (new_frame) { + local = new_frame->local; + new_frame->local = NULL; + STACK_DESTROY(new_frame->root); + } - inode_unref (_inode); - _inode = parent; - just_validated = 0; + if (local) + quota_local_cleanup(local); + } - if (_inode == NULL) { - break; - } + return 0; +} - value = 0; - inode_ctx_get (_inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; - } while (1); +int +quota_validate(call_frame_t *frame, inode_t *inode, xlator_t *this, + fop_lookup_cbk_t cbk_fn) +{ + quota_local_t *local = NULL; + int ret = 0; + dict_t *xdata = NULL; + quota_priv_t *priv = NULL; - if (_inode != NULL) { - inode_unref (_inode); - _inode = NULL; - } + local = frame->local; + priv = this->private; -resume: - quota_resume_fop_if_validation_done (local); - return 0; + LOCK(&local->lock); + { + loc_wipe(&local->validate_loc); + ret = quota_inode_loc_fill(inode, &local->validate_loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENFORCEMENT_FAILED, + "cannot fill loc for inode (gfid:%s), hence " + "aborting quota-checks and continuing with fop", + uuid_utoa(inode->gfid)); + } + } + UNLOCK(&local->lock); + + if (ret < 0) { + ret = -ENOMEM; + goto err; + } + + xdata = dict_new(); + if (xdata == NULL) { + ret = -ENOMEM; + goto err; + } + + ret = dict_set_int8(xdata, 
QUOTA_SIZE_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set failed"); + ret = -ENOMEM; + goto err; + } + + ret = dict_set_str(xdata, "volume-uuid", priv->volume_uuid); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set failed"); + ret = -ENOMEM; + goto err; + } + + ret = quota_enforcer_lookup(frame, this, xdata, cbk_fn); + if (ret < 0) { + ret = -ENOTCONN; + goto err; + } + + ret = 0; err: - quota_handle_validate_error (local, -1, op_errno); + if (xdata) + dict_unref(xdata); - inode_unref (_inode); - return 0; + return ret; } -inline int -quota_get_limits (xlator_t *this, dict_t *dict, int64_t *hard_lim, - int64_t *soft_lim) +void +quota_check_limit_continuation(struct list_head *parents, inode_t *inode, + int32_t op_ret, int32_t op_errno, void *data) { - quota_limit_t *limit = NULL; - quota_priv_t *priv = NULL; - int64_t soft_lim_percent = 0, *ptr = NULL; - int ret = 0; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + quota_local_t *local = NULL; + quota_local_t *par_local = NULL; + quota_dentry_t *entry = NULL; + inode_t *parent = NULL; + int parent_count = 0; + + frame = data; + local = frame->local; + this = THIS; + + if (local->par_frame) + par_local = local->par_frame->local; + else + par_local = local; + + if ((op_ret < 0) || list_empty(parents)) { + if (op_ret >= 0) { + gf_msg(this->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED, + "Couldn't build ancestry for inode (gfid:%s). " + "Without knowing ancestors till root, quota" + "cannot be enforced. 
" + "Hence, failing fop with EIO", + uuid_utoa(inode->gfid)); + op_errno = EIO; + } + + quota_handle_validate_error(frame, -1, op_errno); + goto out; + } + + list_for_each_entry(entry, parents, next) { parent_count++; } + + LOCK(&par_local->lock); + { + par_local->link_count += (parent_count - 1); + } + UNLOCK(&par_local->lock); + + if (local->par_frame) { + list_for_each_entry(entry, parents, next) + { + parent = inode_find(inode->table, entry->par); + quota_check_limit(frame, parent, this); + inode_unref(parent); + } + } else { + list_for_each_entry(entry, parents, next) + { + parent = do_quota_check_limit(frame, inode, this, entry, _gf_true); + if (parent) + inode_unref(parent); + else + quota_link_count_decrement(frame); + } + } - if ((this == NULL) || (dict == NULL) || (hard_lim == NULL) - || (soft_lim == NULL)) - goto out; +out: + return; +} - priv = this->private; +int32_t +quota_check_object_limit(call_frame_t *frame, quota_inode_ctx_t *ctx, + quota_priv_t *priv, inode_t *_inode, xlator_t *this, + int32_t *op_errno, int just_validated, + quota_local_t *local, gf_boolean_t *skip_check) +{ + int32_t ret = -1; + uint32_t timeout = 0; + char need_validate = 0; + gf_boolean_t hard_limit_exceeded = 0; + int64_t object_aggr_count = 0; + + GF_ASSERT(frame); + GF_ASSERT(priv); + GF_ASSERT(_inode); + GF_ASSERT(this); + GF_ASSERT(local); + + if (ctx != NULL && (ctx->object_hard_lim > 0 || ctx->object_soft_lim)) { + LOCK(&ctx->lock); + { + timeout = priv->soft_timeout; - ret = dict_get_bin (dict, QUOTA_LIMIT_KEY, (void **) &ptr); - limit = (quota_limit_t *)ptr; + object_aggr_count = ctx->file_count + ctx->dir_count + 1; + if (((ctx->object_soft_lim >= 0) && + (object_aggr_count) > ctx->object_soft_lim)) { + timeout = priv->hard_timeout; + } - if (limit) { - *hard_lim = ntoh64 (limit->hard_lim); - soft_lim_percent = ntoh64 (limit->soft_lim_percent); + if (!just_validated && quota_timeout(ctx->validate_time, timeout)) { + need_validate = 1; + } else if 
((object_aggr_count) > ctx->object_hard_lim) { + hard_limit_exceeded = 1; + } } + UNLOCK(&ctx->lock); - if (soft_lim_percent < 0) { - soft_lim_percent = priv->default_soft_lim; + if (need_validate && *skip_check != _gf_true) { + *skip_check = _gf_true; + ret = quota_validate(frame, _inode, this, quota_validate_cbk); + if (ret < 0) { + *op_errno = -ret; + *skip_check = _gf_false; + } + goto out; } - if ((*hard_lim > 0) && (soft_lim_percent > 0)) { - *soft_lim = (soft_lim_percent * (*hard_lim))/100; + if (hard_limit_exceeded) { + local->op_ret = -1; + local->op_errno = EDQUOT; + *op_errno = EDQUOT; + goto out; } + /*We log usage only if quota limit is configured on + that inode + */ + quota_log_usage(this, ctx, _inode, 0); + } + + ret = 0; + out: - return 0; + return ret; } -int -quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict, - loc_t *loc, struct iatt *buf, int32_t *op_errno) +int32_t +quota_check_size_limit(call_frame_t *frame, quota_inode_ctx_t *ctx, + quota_priv_t *priv, inode_t *_inode, xlator_t *this, + int32_t *op_errno, int just_validated, int64_t delta, + quota_local_t *local, gf_boolean_t *skip_check) { - int32_t ret = -1; - char found = 0; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL; - uint64_t value = 0; - int64_t hard_lim = -1, soft_lim = -1; - - quota_get_limits (this, dict, &hard_lim, &soft_lim); + int32_t ret = -1; + uint32_t timeout = 0; + char need_validate = 0; + gf_boolean_t hard_limit_exceeded = 0; + int64_t space_available = 0; + int64_t wouldbe_size = 0; + + GF_ASSERT(frame); + GF_ASSERT(priv); + GF_ASSERT(_inode); + GF_ASSERT(this); + GF_ASSERT(local); + + if (ctx != NULL && (ctx->hard_lim > 0 || ctx->soft_lim > 0)) { + wouldbe_size = ctx->size + delta; + + LOCK(&ctx->lock); + { + timeout = priv->soft_timeout; - inode_ctx_get (inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; + if ((ctx->soft_lim >= 0) && (wouldbe_size > ctx->soft_lim)) { + timeout = priv->hard_timeout; + } - if 
((((ctx == NULL) || (ctx->hard_lim == hard_lim)) - && (hard_lim < 0) && !((IA_ISREG (buf->ia_type)) - || (IA_ISLNK (buf->ia_type))))) { - ret = 0; - goto out; + if (!just_validated && quota_timeout(ctx->validate_time, timeout)) { + need_validate = 1; + } else if (wouldbe_size >= ctx->hard_lim) { + hard_limit_exceeded = 1; + } } + UNLOCK(&ctx->lock); - ret = quota_inode_ctx_get (inode, this, &ctx, 1); - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_WARNING, "cannot create quota " - "context in inode(gfid:%s)", - uuid_utoa (inode->gfid)); - ret = -1; - *op_errno = ENOMEM; - goto out; + if (need_validate && *skip_check != _gf_true) { + *skip_check = _gf_true; + ret = quota_validate(frame, _inode, this, quota_validate_cbk); + if (ret < 0) { + *op_errno = -ret; + *skip_check = _gf_false; + } + goto out; } - LOCK (&ctx->lock); - { - ctx->hard_lim = hard_lim; - ctx->soft_lim = soft_lim; + if (hard_limit_exceeded) { + local->op_ret = -1; + local->op_errno = EDQUOT; - ctx->buf = *buf; + space_available = ctx->hard_lim - ctx->size; - if (!(IA_ISREG (buf->ia_type) || IA_ISLNK (buf->ia_type))) { - goto unlock; - } + if (space_available < 0) + space_available = 0; - if (loc->name == NULL) - goto unlock; + if ((local->space_available < 0) || + (local->space_available > space_available)) { + local->space_available = space_available; + } - list_for_each_entry (dentry, &ctx->parents, next) { - if ((strcmp (dentry->name, loc->name) == 0) && - (uuid_compare (loc->parent->gfid, - dentry->par) == 0)) { - found = 1; - break; - } - } - - if (!found) { - dentry = __quota_dentry_new (ctx, - (char *)loc->name, - loc->parent->gfid); - if (dentry == NULL) { - /* - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (par:%" - PRId64", name:%s) for inode(ino:%" - PRId64", gfid:%s)", - uuid_utoa (local->loc.inode->gfid)); - */ - ret = -1; - *op_errno = ENOMEM; - goto unlock; - } - } + if (space_available == 0) { + *op_errno = EDQUOT; + goto out; + } } -unlock: - 
UNLOCK (&ctx->lock); + /* We log usage only if quota limit is configured on + that inode. */ + quota_log_usage(this, ctx, _inode, delta); + } + + ret = 0; out: - return ret; + return ret; } int32_t -quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) +quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this) { - quota_local_t *local = NULL; + int32_t ret = -1, op_errno = EINVAL; + inode_t *_inode = NULL, *parent = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + quota_local_t *par_local = NULL; + char just_validated = 0; + int64_t delta = 0; + int8_t object_delta = 0; + uint64_t value = 0; + gf_boolean_t skip_check = _gf_false; + + GF_VALIDATE_OR_GOTO("quota", this, err); + GF_VALIDATE_OR_GOTO(this->name, frame, err); + GF_VALIDATE_OR_GOTO(this->name, inode, err); + + local = frame->local; + GF_VALIDATE_OR_GOTO(this->name, local, err); + + if (local->par_frame) { + par_local = local->par_frame->local; + GF_VALIDATE_OR_GOTO(this->name, par_local, err); + } else { + par_local = local; + } + + delta = par_local->delta; + object_delta = par_local->object_delta; + + GF_VALIDATE_OR_GOTO(this->name, par_local->stub, err); + /* Allow all the trusted clients + * Don't block the gluster internal processes like rebalance, gsyncd, + * self heal etc from the disk quotas. + * + * Method: Allow all the clients with PID negative. This is by the + * assumption that any kernel assigned pid doesn't have the negative + * number. 
+ */ + if (0 > frame->root->pid) { + ret = 0; + quota_link_count_decrement(frame); + goto done; + } - if (op_ret < 0) - goto unwind; + priv = this->private; - local = frame->local; + inode_ctx_get(inode, this, &value); + ctx = (quota_inode_ctx_t *)(unsigned long)value; - op_ret = quota_fill_inodectx (this, inode, dict, &local->loc, buf, - &op_errno); + _inode = inode_ref(inode); -unwind: - QUOTA_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, - dict, postparent); - return 0; -} + LOCK(&local->lock); + { + just_validated = local->just_validated; + local->just_validated = 0; + } + UNLOCK(&local->lock); + do { + /* In a rename operation, enforce should be stopped at common + ancestor */ + if (!gf_uuid_is_null(par_local->common_ancestor) && + !gf_uuid_compare(_inode->gfid, par_local->common_ancestor)) { + quota_link_count_decrement(frame); + break; + } -int32_t -quota_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) -{ - quota_priv_t *priv = NULL; - int32_t ret = -1; - quota_local_t *local = NULL; + if (object_delta <= 0) + goto skip_check_object_limit; - priv = this->private; + ret = quota_check_object_limit(frame, ctx, priv, _inode, this, + &op_errno, just_validated, par_local, + &skip_check); + if (skip_check == _gf_true) + goto done; - xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); - if (!xattr_req) - goto err; + if (ret) { + if (op_errno != EDQUOT) + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_ENFORCEMENT_FAILED, + "Failed to " + "check quota object limit"); + goto err; + } - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + skip_check_object_limit: + ret = quota_check_size_limit(frame, ctx, priv, _inode, this, &op_errno, + just_validated, delta, par_local, + &skip_check); + if (skip_check == _gf_true) + goto done; - local = quota_local_new (); - if (local == NULL) { - goto err; + if (ret) { + if (op_errno != EDQUOT) + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_ENFORCEMENT_FAILED, + "Failed to " + "check quota size limit"); + goto err; } - frame->local = local; - loc_copy (&local->loc, loc); + if (__is_root_gfid(_inode->gfid)) { + quota_link_count_decrement(frame); + break; + } - ret = dict_set_int8 (xattr_req, QUOTA_LIMIT_KEY, 1); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "dict set of key for hard-limit failed"); + parent = inode_parent(_inode, 0, NULL); + if (parent == NULL) { + ret = quota_build_ancestry(_inode, quota_check_limit_continuation, + frame); + if (ret < 0) { + op_errno = -ret; goto err; + } + + break; } -wind: - /* TODO: check with vshastry@redhat.com to cleanup the ugliness of - * checking priv->is_quota_on here by using STACK_WIND_TAIL macro - */ - STACK_WIND (frame, - priv->is_quota_on ? 
quota_lookup_cbk : default_lookup_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, - xattr_req); + inode_unref(_inode); + _inode = parent; + just_validated = 0; - ret = 0; + value = 0; + inode_ctx_get(_inode, this, &value); + ctx = (quota_inode_ctx_t *)(unsigned long)value; + } while (1); + +done: + if (_inode != NULL) { + inode_unref(_inode); + _inode = NULL; + } + return 0; err: - if (xattr_req) - dict_unref (xattr_req); + quota_handle_validate_error(frame, -1, op_errno); - if (ret < 0) { - QUOTA_STACK_UNWIND (lookup, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL); + inode_unref(_inode); + return 0; +} + +inode_t * +do_quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this, + quota_dentry_t *dentry, gf_boolean_t force) +{ + int32_t ret = -1; + inode_t *parent = NULL; + call_frame_t *new_frame = NULL; + quota_local_t *new_local = NULL; + + parent = inode_parent(inode, dentry->par, dentry->name); + if (parent == NULL) { + if (force) + parent = inode_find(inode->table, dentry->par); + else + goto out; + } + if (parent == NULL) + goto out; + + new_frame = copy_frame(frame); + if (new_frame == NULL) + goto out; + + new_local = quota_local_new(); + if (new_local == NULL) + goto out; + + new_frame->local = new_local; + new_local->par_frame = frame; + + quota_check_limit(new_frame, parent, this); + + ret = 0; +out: + if (ret < 0) { + if (parent) { + /* Caller should decrement link_count, in case parent is + * NULL + */ + quota_handle_validate_error(frame, -1, ENOMEM); } - return 0; -} + if (new_frame) { + new_frame->local = NULL; + STACK_DESTROY(new_frame->root); + } + } + return parent; +} -void -quota_update_size (xlator_t *this, inode_t *inode, char *name, uuid_t par, - int64_t delta) +static int +quota_get_limits(xlator_t *this, dict_t *dict, int64_t *hard_lim, + int64_t *soft_lim, int64_t *object_hard_limit, + int64_t *object_soft_limit) { - inode_t *_inode = NULL; - inode_t *parent = NULL; - uint64_t value = 0; - quota_inode_ctx_t *ctx = 
NULL; - uuid_t trav_uuid = {0,}; + quota_limits_t *limit = NULL; + quota_limits_t *object_limit = NULL; + quota_priv_t *priv = NULL; + int64_t soft_lim_percent = 0; + int64_t *ptr = NULL; + int ret = 0; - GF_VALIDATE_OR_GOTO ("quota", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - inode_ctx_get (inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; + if ((this == NULL) || (dict == NULL) || (hard_lim == NULL) || + (soft_lim == NULL)) + goto out; - _inode = inode_ref (inode); + priv = this->private; - if ( par != NULL ) { - uuid_copy (trav_uuid, par); - } + ret = dict_get_bin(dict, QUOTA_LIMIT_KEY, (void **)&ptr); + limit = (quota_limits_t *)ptr; - do { - if ((ctx != NULL) && (ctx->hard_lim >= 0)) { - quota_log_usage (this, ctx, _inode, delta); - LOCK (&ctx->lock); - { - ctx->size += delta; - if (ctx->size < 0) - ctx->size = 0; - } - UNLOCK (&ctx->lock); - } + if (limit) { + *hard_lim = ntoh64(limit->hl); + soft_lim_percent = ntoh64(limit->sl); + } - if (__is_root_gfid (_inode->gfid)) { - break; - } + if (soft_lim_percent < 0) { + soft_lim_percent = priv->default_soft_lim; + } - parent = inode_parent (_inode, trav_uuid, name); - if (parent == NULL) { - /* TODO: build ancestry and continue updating size */ - gf_log (this->name, GF_LOG_DEBUG, - "cannot find parent for inode (gfid:%s), hence " - "aborting size updation of parents", - uuid_utoa (_inode->gfid)); - } + if ((*hard_lim > 0) && (soft_lim_percent > 0)) { + *soft_lim = (soft_lim_percent * (*hard_lim)) / 100; + } - if (name != NULL) { - name = NULL; - uuid_clear (trav_uuid); - } + ret = dict_get_bin(dict, QUOTA_LIMIT_OBJECTS_KEY, (void **)&ptr); + if (ret) + return 0; + object_limit = (quota_limits_t *)ptr; - inode_unref (_inode); - _inode = parent; + if (object_limit) { + *object_hard_limit = ntoh64(object_limit->hl); + soft_lim_percent = ntoh64(object_limit->sl); + } - if (_inode == NULL) { - break; - } + if (soft_lim_percent < 0) { + soft_lim_percent = 
priv->default_soft_lim; + } - value = 0; - ctx = NULL; - inode_ctx_get (_inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; - } while (1); + if ((*object_hard_limit > 0) && (soft_lim_percent > 0)) { + *object_soft_limit = (soft_lim_percent * (*object_hard_limit)) / 100; + } out: - return; + return 0; } - -int32_t -quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +int +quota_fill_inodectx(xlator_t *this, inode_t *inode, dict_t *dict, loc_t *loc, + struct iatt *buf, int32_t *op_errno) { - int32_t ret = 0; - uint64_t ctx_int = 0; - quota_inode_ctx_t *ctx = NULL; - quota_local_t *local = NULL; - quota_dentry_t *dentry = NULL, *tmp = NULL; - int64_t delta = 0; - struct list_head head = {0, }; - - local = frame->local; + int32_t ret = -1; + char found = 0; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL; + uint64_t value = 0; + int64_t hard_lim = 0; + int64_t soft_lim = 0; + int64_t object_hard_limit = 0; + int64_t object_soft_limit = 0; + + quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit, + &object_soft_limit); + + inode_ctx_get(inode, this, &value); + ctx = (quota_inode_ctx_t *)(unsigned long)value; + + if ((((ctx == NULL) || (ctx->hard_lim == hard_lim)) && (hard_lim < 0) && + !QUOTA_REG_OR_LNK_FILE(buf->ia_type))) { + ret = 0; + goto out; + } + + ret = quota_inode_ctx_get(inode, this, &ctx, 1); + if ((ret == -1) || (ctx == NULL)) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_INODE_CTX_GET_FAILED, + "cannot create quota " + "context in inode(gfid:%s)", + uuid_utoa(inode->gfid)); + ret = -1; + *op_errno = ENOMEM; + goto out; + } + + LOCK(&ctx->lock); + { + ctx->hard_lim = hard_lim; + ctx->soft_lim = soft_lim; + ctx->object_hard_lim = object_hard_limit; + ctx->object_soft_lim = object_soft_limit; + + ctx->buf = *buf; + + if (!QUOTA_REG_OR_LNK_FILE(buf->ia_type)) { + goto unlock; + } + + /* 
do nothing if it is a nameless lookup */ + if (loc->name == NULL || !loc->parent) + goto unlock; + + list_for_each_entry(dentry, &ctx->parents, next) + { + if ((strcmp(dentry->name, loc->name) == 0) && + (gf_uuid_compare(loc->parent->gfid, dentry->par) == 0)) { + found = 1; + break; + } + } - if ((op_ret < 0) || (local == NULL)) { - goto out; + if (!found) { + dentry = __quota_dentry_new(ctx, (char *)loc->name, + loc->parent->gfid); + if (dentry == NULL) { + /* + gf_msg (this->name, GF_LOG_WARNING, ENOMEM, + Q_MSG_ENOMEM, + "cannot create a new dentry (par:%" +- PRId64", name:%s) for inode(ino:%" +- PRId64", gfid:%s)", +- uuid_utoa (local->loc.inode->gfid)); + */ + ret = -1; + *op_errno = ENOMEM; + goto unlock; + } } + } +unlock: + UNLOCK(&ctx->lock); - INIT_LIST_HEAD (&head); +out: + return ret; +} - ret = inode_ctx_get (local->loc.inode, this, &ctx_int); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context", local->loc.path); - goto out; - } +/* + * return _gf_true if enforcement is needed and _gf_false otherwise + */ +gf_boolean_t +should_quota_enforce(xlator_t *this, dict_t *dict, glusterfs_fop_t fop) +{ + int ret = 0; - ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int; + ret = dict_check_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "quota context not set in %s (gfid:%s)", - local->loc.path, uuid_utoa (local->loc.inode->gfid)); - goto out; - } + if (fop == GF_FOP_MKDIR && ret == DICT_FLAG_SET) { + return _gf_false; + } else if (ret == -ENOENT) { + gf_msg(this->name, GF_LOG_DEBUG, EINVAL, Q_MSG_INTERNAL_FOP_KEY_MISSING, + "No internal fop context present"); + goto out; + } +out: + return _gf_true; +} - LOCK (&ctx->lock); - { - ctx->buf = *postbuf; +int32_t +quota_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *dict, struct iatt *postparent) +{ + quota_local_t *local = 
NULL; + inode_t *this_inode = NULL; - list_for_each_entry (dentry, &ctx->parents, next) { - tmp = __quota_dentry_new (NULL, dentry->name, - dentry->par); - list_add_tail (&tmp->next, &head); - } + local = frame->local; + frame->local = NULL; - } - UNLOCK (&ctx->lock); + if (op_ret >= 0 && inode) { + this_inode = inode_ref(inode); - if (postbuf->ia_blocks != prebuf->ia_blocks) - delta = local->delta; + op_ret = quota_fill_inodectx(this, inode, dict, &local->loc, buf, + &op_errno); + if (op_ret < 0) + op_errno = ENOMEM; + } - list_for_each_entry_safe (dentry, tmp, &head, next) { - quota_update_size (this, local->loc.inode, dentry->name, - dentry->par, delta); - __quota_dentry_free (dentry); - } + QUOTA_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, dict, + postparent); + + if (op_ret < 0 || this_inode == NULL || gf_uuid_is_null(this_inode->gfid)) + goto out; + + check_ancestory_2(this, local, this_inode); out: - QUOTA_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + if (this_inode) + inode_unref(this_inode); - return 0; -} + quota_local_cleanup(local); + return 0; +} int32_t -quota_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t off, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +quota_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; - struct iovec *new_vector = NULL; - int32_t new_count = 0; + quota_priv_t *priv = NULL; + int32_t ret = -1; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - if (local->op_ret == -1) { - op_errno = local->op_errno; + xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); + if (!xattr_req) + goto err; - if ((op_errno == EDQUOT) && (local->space_available > 0)) { - new_count = iov_subset (vector, count, 0, - local->space_available, NULL); + local = quota_local_new(); + if (local == NULL) { + goto err; + } - new_vector = GF_CALLOC (new_count, - sizeof (struct iovec), - gf_common_mt_iovec); - if (new_vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + frame->local = local; + loc_copy(&local->loc, loc); - new_count = iov_subset (vector, count, 0, - local->space_available, - new_vector); + ret = dict_set_int8(xattr_req, QUOTA_LIMIT_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set of key for " + "hard-limit failed"); + goto err; + } - vector = new_vector; - count = new_count; - } else { - goto unwind; - } - } + ret = dict_set_int8(xattr_req, QUOTA_LIMIT_OBJECTS_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set of key for quota object limit failed"); + goto err; + } - STACK_WIND (frame, - priv->is_quota_on? 
quota_writev_cbk: default_writev_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd, - vector, count, off, flags, iobref, xdata); + STACK_WIND(frame, quota_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); - if (new_vector != NULL) - GF_FREE (new_vector); + ret = 0; - return 0; +err: + if (xattr_req) + dict_unref(xattr_req); -unwind: - QUOTA_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; -} + if (ret < 0) { + QUOTA_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + } + return 0; + +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, + loc, xattr_req); + return 0; +} int32_t -quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t off, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +quota_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1, op_errno = EINVAL; - int32_t parents = 0; - uint64_t size = 0; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL, *tmp = NULL; - call_stub_t *stub = NULL; - struct list_head head = {0, }; + int32_t ret = 0; + uint64_t ctx_int = 0; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + + local = frame->local; + + if ((op_ret < 0) || (local == NULL) || (postbuf == NULL)) { + goto out; + } + + ret = inode_ctx_get(local->loc.inode, this, &ctx_int); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "%s: failed to get the " + "context", + local->loc.path); + goto out; + } + + ctx = (quota_inode_ctx_t *)(unsigned long)ctx_int; + + if (ctx == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "quota context not set in %s (gfid:%s)", local->loc.path, + uuid_utoa(local->loc.inode->gfid)); + goto 
out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK(&ctx->lock); - priv = this->private; +out: + QUOTA_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + return 0; +} - INIT_LIST_HEAD (&head); +static int gf_quota_enforcer_log; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("quota", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); +int32_t +quota_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t off, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; + struct iovec *new_vector = NULL; + int32_t new_count = 0; - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = frame->local; - frame->local = local; - local->loc.inode = inode_ref (fd->inode); + GF_VALIDATE_OR_GOTO("quota", local, unwind); - ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (fd->inode->gfid)); - } + if (local->op_ret == -1) { + op_errno = local->op_errno; - stub = fop_writev_stub (frame, quota_writev_helper, fd, vector, count, - off, flags, iobref, xdata); - if (stub == NULL) { - op_errno = ENOMEM; + if ((op_errno == EDQUOT) && (local->space_available > 0)) { + new_count = iov_subset(vector, count, 0, local->space_available, + &new_vector, 0); + if (new_count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; goto unwind; + } + + vector = new_vector; + count = new_count; + } else if (op_errno == ENOENT || op_errno == ESTALE) { + /* We may get ENOENT/ESTALE in case of below scenario + * fd = open file.txt + * unlink file.txt + * write on fd + * Here build_ancestry can fail as the file is removed. 
+ * For now ignore ENOENT/ESTALE with writes on active fd + * We need to re-visit this code once we understand + * how other file-system behave in this scenario + */ + gf_msg_debug(this->name, 0, + "quota enforcer failed " + "with ENOENT/ESTALE on %s, cannot check " + "quota limits and allowing writes", + uuid_utoa(fd->inode->gfid)); + } else if ((op_errno == EINVAL) && + !inode_parent(local->loc.inode, 0, NULL)) { + /* We may get INVAL with parent == NULL, + * in case of below scenario + * 1. enable quota + * 2. glusterfsd stop/start + * 3. nameless lookup + * 4. write on fd + * Here build_ancestry can fail as the file's pgfid + * is't exist. + * For now ignore EINVAL with writes on active fd + * untils the pgfid is created at name lookup + */ + GF_LOG_OCCASIONALLY(gf_quota_enforcer_log, this->name, + GF_LOG_CRITICAL, + "Quota cannot be enforced as " + "parent is not available and writes are being " + "allowed without checking whether they are " + "within quota limits. This can happen if Quota " + "crawl is not complete. 
If crawl has been " + "completed, please file a bug."); + } else { + goto unwind; } + } - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, unwind); - - size = iov_length (vector, count); - if (ctx != NULL) { - LOCK (&ctx->lock); - { - list_for_each_entry (dentry, &ctx->parents, next) { - tmp = __quota_dentry_new (NULL, dentry->name, - dentry->par); - list_add_tail (&tmp->next, &head); - parents++; - } - } - UNLOCK (&ctx->lock); - } + STACK_WIND(frame, quota_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags, + iobref, xdata); - local->delta = size; + if (new_vector != NULL) + GF_FREE(new_vector); + + return 0; - local->link_count = parents; +unwind: + QUOTA_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +quota_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + quota_priv_t *priv = NULL; + int32_t op_errno = EINVAL; + int32_t parents = 0; + int32_t fail_count = 0; + uint64_t size = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL, *tmp = NULL; + call_stub_t *stub = NULL; + struct list_head head; + inode_t *par_inode = NULL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + INIT_LIST_HEAD(&head); + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO("quota", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } + + frame->local = local; + local->loc.inode = inode_ref(fd->inode); + + (void)quota_inode_ctx_get(fd->inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(fd->inode->gfid)); + } + + stub = fop_writev_stub(frame, quota_writev_helper, fd, vector, count, off, + flags, iobref, xdata); + if (stub == NULL) { + op_errno = ENOMEM; + goto unwind; + } + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, unwind); + + parents = quota_add_parents_from_ctx(ctx, &head); + if (parents == -1) { + op_errno = errno; + goto unwind; + } + + size = iov_length(vector, count); + + LOCK(&local->lock); + { + local->delta = size; + local->object_delta = 0; + local->link_count = (parents != 0) ? parents : 1; local->stub = stub; + } + UNLOCK(&local->lock); - if (parents == 0) { - local->link_count = 1; - quota_check_limit (frame, fd->inode, this, NULL, NULL); - } else { - list_for_each_entry_safe (dentry, tmp, &head, next) { - quota_check_limit (frame, fd->inode, this, dentry->name, - dentry->par); - __quota_dentry_free (dentry); + if (parents == 0) { + /* nameless lookup on this inode, allow quota to reconstruct + * ancestry as part of check_limit. 
+ */ + quota_check_limit(frame, fd->inode, this); + } else { + list_for_each_entry_safe(dentry, tmp, &head, next) + { + par_inode = do_quota_check_limit(frame, fd->inode, this, dentry, + _gf_false); + if (par_inode == NULL) { + if (ctx) { + /* remove stale entry from inode ctx */ + quota_dentry_del(ctx, dentry->name, dentry->par); + parents--; + fail_count++; } + } else { + inode_unref(par_inode); + } + __quota_dentry_free(dentry); } - return 0; + if (parents == 0) { + LOCK(&local->lock); + { + local->link_count++; + } + UNLOCK(&local->lock); + quota_check_limit(frame, fd->inode, this); + } + + while (fail_count != 0) { + quota_link_count_decrement(frame); + fail_count--; + } + } + + return 0; unwind: - QUOTA_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; -wind: - STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, - vector, count, off, flags, iobref, xdata); - return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + return 0; } - int32_t -quota_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - QUOTA_STACK_UNWIND (mkdir, frame, op_ret, op_errno, inode, - buf, preparent, postparent, xdata); - return 0; + QUOTA_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int32_t -quota_mkdir_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode, mode_t umask, dict_t *xdata) +quota_mkdir_helper(call_frame_t *frame, 
xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + local = frame->local; - op_errno = local->op_errno; + GF_VALIDATE_OR_GOTO("quota", local, unwind); - if (local->op_ret == -1) { - goto unwind; - } + op_errno = local->op_errno; - STACK_WIND (frame, - quota_mkdir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, - mode, umask, xdata); + if (local->op_ret == -1) { + goto unwind; + } - return 0; + STACK_WIND(frame, quota_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + + return 0; unwind: - QUOTA_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } - int32_t -quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +quota_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = 0, op_errno = 0; - quota_local_t *local = NULL; - call_stub_t *stub = NULL; - - priv = this->private; - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - local = quota_local_new (); - if (local == NULL) { - op_errno = ENOMEM; - goto err; - } - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } - - stub = fop_mkdir_stub (frame, quota_mkdir_helper, loc, mode, umask, - xdata); - if (stub == NULL) { - op_errno = ENOMEM; - goto err; - } - + quota_priv_t *priv = NULL; + int32_t ret = 0, op_errno = 0; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + 
WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + if (!should_quota_enforce(this, xdata, GF_FOP_MKDIR)) { + gf_msg(this->name, GF_LOG_DEBUG, 0, Q_MSG_ENFORCEMENT_SKIPPED, + "Enforcement has been skipped(internal fop)."); + goto off; + } + + local = quota_local_new(); + if (local == NULL) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + + ret = loc_copy(&local->loc, loc); + if (ret) { + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + stub = fop_mkdir_stub(frame, quota_mkdir_helper, loc, mode, umask, xdata); + if (stub == NULL) { + op_errno = ENOMEM; + goto err; + } + + LOCK(&local->lock); + { local->stub = stub; local->delta = 0; + local->object_delta = 1; local->link_count = 1; + } + UNLOCK(&local->lock); - quota_check_limit (frame, loc->parent, this, NULL, NULL); - return 0; + quota_check_limit(frame, loc->parent, this); + return 0; err: - QUOTA_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + QUOTA_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_mkdir_cbk: default_mkdir_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, - mode, umask, xdata); +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + loc, mode, umask, xdata); - return 0; + return 0; } - int32_t -quota_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t ret = -1; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL; - - local = frame->local; - if (op_ret < 0) { - goto unwind; - } - - ret = quota_inode_ctx_get (inode, this, &ctx, 1); - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_WARNING, "cannot create quota " - "context in inode(gfid:%s)", - uuid_utoa (inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - LOCK (&ctx->lock); - { - ctx->buf = *buf; - - dentry = __quota_dentry_new (ctx, (char *)local->loc.name, - local->loc.parent->gfid); - if (dentry == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (name:%s) for " - "inode(gfid:%s)", local->loc.name, - uuid_utoa (local->loc.inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unlock; - } - } + int32_t ret = -1; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL; + + local = frame->local; + if (op_ret < 0) { + goto unwind; + } + + ret = quota_inode_ctx_get(inode, this, &ctx, 1); + if ((ret == -1) || (ctx == NULL)) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_INODE_CTX_GET_FAILED, + "cannot create quota " + "context in inode(gfid:%s)", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + 
goto unwind; + } + + LOCK(&ctx->lock); + { + ctx->buf = *buf; + + dentry = __quota_dentry_new(ctx, (char *)local->loc.name, + local->loc.parent->gfid); + if (dentry == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot create a new dentry " + "(name:%s) for inode(gfid:%s)", + local->loc.name, uuid_utoa(local->loc.inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unlock; + } + } unlock: - UNLOCK (&ctx->lock); + UNLOCK(&ctx->lock); unwind: - QUOTA_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent, xdata); - return 0; + QUOTA_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; } - int32_t -quota_create_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, mode_t umask, fd_t *fd, - dict_t *xdata) +quota_create_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; + local = frame->local; - priv = this->private; + GF_VALIDATE_OR_GOTO("quota", local, unwind); - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } - - if (local->op_ret == -1) { - op_errno = local->op_errno; - goto unwind; - } + if (local->op_ret == -1) { + op_errno = local->op_errno; + goto unwind; + } - - STACK_WIND (frame, - priv->is_quota_on? 
quota_create_cbk: default_create_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, - flags, mode, umask, fd, xdata); - return 0; + STACK_WIND(frame, quota_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } - int32_t -quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +quota_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1; - quota_local_t *local = NULL; - int32_t op_errno = 0; - call_stub_t *stub = NULL; - - priv = this->private; - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - local = quota_local_new (); - if (local == NULL) { - op_errno = ENOMEM; - goto err; - } - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - op_errno = ENOMEM; - goto err; - } - - stub = fop_create_stub (frame, quota_create_helper, loc, flags, mode, - umask, fd, xdata); - if (stub == NULL) { - goto err; - } - + quota_priv_t *priv = NULL; + int32_t ret = -1; + quota_local_t *local = NULL; + int32_t op_errno = 0; + call_stub_t *stub = NULL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + QUOTA_WIND_FOR_INTERNAL_FOP(xdata, off); + + local = quota_local_new(); + if (local == NULL) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + + ret = loc_copy(&local->loc, loc); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + op_errno = ENOMEM; + goto err; + } + + stub = fop_create_stub(frame, quota_create_helper, loc, flags, mode, umask, + 
fd, xdata); + if (stub == NULL) { + goto err; + } + + LOCK(&local->lock); + { local->link_count = 1; local->stub = stub; local->delta = 0; + local->object_delta = 1; + } + UNLOCK(&local->lock); - quota_check_limit (frame, loc->parent, this, NULL, NULL); - return 0; + quota_check_limit(frame, loc->parent, this); + return 0; err: - QUOTA_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL, NULL); + QUOTA_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); - return 0; + return 0; -wind: - STACK_WIND (frame, - priv->is_quota_on? quota_create_cbk: default_create_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, - flags, mode, umask, fd, xdata); - return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; } - int32_t -quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - uint64_t value = 0; - quota_dentry_t *dentry = NULL; - quota_dentry_t *old_dentry = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + uint64_t value = 0; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = (quota_local_t *) frame->local; + local = (quota_local_t *)frame->local; - inode_ctx_get (local->loc.inode, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long)value; + inode_ctx_get(local->loc.inode, this, &value); + ctx = (quota_inode_ctx_t *)(unsigned long)value; - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "quota context not set in inode (gfid:%s)", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } - - if 
(!local->skip_check) - quota_update_size (this, local->loc.inode, - (char *)local->loc.name, - local->loc.parent->gfid, - (-(ctx->buf.ia_blocks * 512))); + if (ctx == NULL) { + gf_msg(this->name, GF_LOG_INFO, EINVAL, Q_MSG_INODE_CTX_GET_FAILED, + "quota context not set inode (gfid:%s)", + uuid_utoa(local->loc.gfid)); + goto out; + } - LOCK (&ctx->lock); - { - list_for_each_entry (dentry, &ctx->parents, next) { - if ((strcmp (dentry->name, local->loc.name) == 0) && - (uuid_compare (local->loc.parent->gfid, - dentry->par) == 0)) { - old_dentry = dentry; - break; - } - } - if (old_dentry) - __quota_dentry_free (old_dentry); - } - UNLOCK (&ctx->lock); + quota_dentry_del(ctx, local->loc.name, local->loc.parent->gfid); out: - QUOTA_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent, - postparent, xdata); - return 0; + QUOTA_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; } - int32_t -quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +quota_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1; - quota_local_t *local = NULL; - - priv = this->private; + quota_priv_t *priv = NULL; + int32_t ret = -1; + quota_local_t *local = NULL; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + priv = this->private; - local = quota_local_new (); - if (local == NULL) { - goto err; - } + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - frame->local = local; + local = quota_local_new(); + if (local == NULL) { + goto err; + } - if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { - local->skip_check = _gf_true; - } + frame->local = local; - ret = loc_copy (&local->loc, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } + ret = loc_copy(&local->loc, loc); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } -wind: - STACK_WIND 
(frame, - priv->is_quota_on? quota_unlink_cbk: default_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, loc, - xflag, xdata); + STACK_WIND(frame, quota_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); - ret = 0; + ret = 0; err: - if (ret == -1) { - QUOTA_STACK_UNWIND (unlink, frame, -1, 0, NULL, NULL, NULL); - } + if (ret == -1) { + QUOTA_STACK_UNWIND(unlink, frame, -1, 0, NULL, NULL, NULL); + } - return 0; -} + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; +} int32_t -quota_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t ret = -1; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL; - char found = 0; - - if (op_ret < 0) { - goto out; - } - - local = (quota_local_t *) frame->local; - - if (local->skip_check) - goto out; - - quota_update_size (this, local->loc.parent, NULL, NULL, - (buf->ia_blocks * 512)); - - ret = quota_inode_ctx_get (inode, this, &ctx, 0); - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). 
" - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (inode->gfid)); - goto out; - } - - LOCK (&ctx->lock); + int32_t ret = -1; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL; + char found = 0; + + if (op_ret < 0) { + goto out; + } + + local = (quota_local_t *)frame->local; + + ret = quota_inode_ctx_get(inode, this, &ctx, 0); + if ((ret == -1) || (ctx == NULL)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(inode->gfid)); + goto out; + } + + LOCK(&ctx->lock); + { + list_for_each_entry(dentry, &ctx->parents, next) { - list_for_each_entry (dentry, &ctx->parents, next) { - if ((strcmp (dentry->name, local->loc.name) == 0) && - (uuid_compare (local->loc.parent->gfid, - dentry->par) == 0)) { - found = 1; - gf_log (this->name, GF_LOG_WARNING, - "new entry being linked (name:%s) for " - "inode (gfid:%s) is already present " - "in inode-dentry-list", dentry->name, - uuid_utoa (local->loc.inode->gfid)); - break; - } - } - - if (!found) { - dentry = __quota_dentry_new (ctx, - (char *)local->loc.name, - local->loc.parent->gfid); - if (dentry == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (name:%s) " - "for inode(gfid:%s)", local->loc.name, - uuid_utoa (local->loc.inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unlock; - } - } - - ctx->buf = *buf; + if ((strcmp(dentry->name, local->loc.name) == 0) && + (gf_uuid_compare(local->loc.parent->gfid, dentry->par) == 0)) { + found = 1; + + gf_msg_debug(this->name, 0, + "new entry being" + " linked (name:%s) for inode " + "(gfid:%s) is already present " + "in inode-dentry-list", + dentry->name, uuid_utoa(local->loc.inode->gfid)); + break; + } + } + + if (!found) { + dentry = __quota_dentry_new(ctx, (char *)local->loc.name, + local->loc.parent->gfid); + if (dentry == 
NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot create a new dentry (name:%s)" + "for inode(gfid:%s)", + local->loc.name, uuid_utoa(local->loc.inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unlock; + } } + + ctx->buf = *buf; + } unlock: - UNLOCK (&ctx->lock); + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); + QUOTA_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - return 0; + return 0; } - int32_t -quota_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +quota_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - priv = this->private; + local = frame->local; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + GF_VALIDATE_OR_GOTO("quota", local, unwind); - op_errno = local->op_errno; + op_errno = local->op_errno; - if (local->op_ret == -1) { - goto unwind; - } + if (local->op_ret == -1) { + goto unwind; + } - STACK_WIND (frame, priv->is_quota_on? 
quota_link_cbk: default_link_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, oldloc, - newloc, xdata); - return 0; + STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } - -int32_t -quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +void +quota_link_continue(call_frame_t *frame) { - quota_priv_t *priv = NULL; - int32_t ret = -1, op_errno = ENOMEM; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - call_stub_t *stub = NULL; - - priv = this->private; + int32_t ret = -1; + int32_t op_errno = EIO; + quota_local_t *local = NULL; + uuid_t common_ancestor = {0}; + xlator_t *this = NULL; + quota_inode_ctx_t *ctx = NULL; + inode_t *src_parent = NULL; + inode_t *dst_parent = NULL; + + local = frame->local; + this = THIS; + + if (local->op_ret < 0) { + op_errno = local->op_errno; + goto err; + } - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + if (local->xdata && dict_get(local->xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { + /* Treat link as rename, crawl upwards only till common ancestor + */ + ret = quota_find_common_ancestor( + local->oldloc.inode, local->newloc.parent, &common_ancestor); + if (ret < 0 || gf_uuid_is_null(common_ancestor)) { + gf_msg(this->name, GF_LOG_ERROR, ESTALE, + Q_MSG_ANCESTRY_BUILD_FAILED, + "failed to get " + "common_ancestor for %s and %s", + local->oldloc.path, local->newloc.path); + op_errno = ESTALE; + goto err; + } + } else { + /* Treat link as a new file. + * TODO: Currently marker accounts twice for the links created + * across directories. 
+ * This needs re-visit if marker accounts only once + * for the links created across directories + */ + if (local->oldloc.parent) + src_parent = inode_ref(local->oldloc.parent); + else + src_parent = inode_parent(local->oldloc.inode, 0, NULL); + dst_parent = local->newloc.parent; - quota_inode_ctx_get (oldloc->inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (oldloc->inode->gfid)); - } + /* No need to check quota limit if src and dst parents are same + */ + if (src_parent == dst_parent || + gf_uuid_compare(src_parent->gfid, dst_parent->gfid) == 0) { + inode_unref(src_parent); + goto wind; + } + + inode_unref(src_parent); + } + + quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->oldloc.inode->gfid)); + } + + LOCK(&local->lock); + { + local->link_count = 1; + local->delta = (ctx != NULL) ? 
ctx->buf.ia_blocks * 512 : 0; + local->object_delta = 1; + gf_uuid_copy(local->common_ancestor, common_ancestor); + } + UNLOCK(&local->lock); - local = quota_local_new (); - if (local == NULL) { - goto err; - } + quota_check_limit(frame, local->newloc.parent, this); + return; - frame->local = (void *) local; +err: + QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return; - if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { - local->skip_check = _gf_true; - goto wind; - } +wind: + STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, &(local->oldloc), + &(local->newloc), local->xdata); + return; +} - ret = loc_copy (&local->loc, newloc); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } +int32_t +quota_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + quota_priv_t *priv = NULL; + int32_t ret = -1; + int32_t op_errno = ENOMEM; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + local = quota_local_new(); + if (local == NULL) { + goto err; + } + + frame->local = (void *)local; + + if (xdata) + local->xdata = dict_ref(xdata); + + ret = loc_copy(&local->loc, newloc); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + ret = loc_copy(&local->oldloc, oldloc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + ret = loc_copy(&local->newloc, newloc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + /* No need to check quota limit if src and dst parents are same */ + if (oldloc->parent && newloc->parent && + !gf_uuid_compare(oldloc->parent->gfid, newloc->parent->gfid)) { + gf_msg_debug(this->name, GF_LOG_DEBUG, + "link %s -> %s are " + 
"in the same directory, so skip check limit", + oldloc->path, newloc->path); + goto wind; + } + + stub = fop_link_stub(frame, quota_link_helper, oldloc, newloc, xdata); + if (stub == NULL) { + goto err; + } + + LOCK(&local->lock); + { + local->link_count = 2; + local->fop_continue_cbk = quota_link_continue; + local->stub = stub; + } + UNLOCK(&local->lock); - stub = fop_link_stub (frame, quota_link_helper, oldloc, newloc, xdata); - if (stub == NULL) { - goto err; - } + check_ancestory(frame, newloc->parent); - local->link_count = 1; - local->stub = stub; - local->delta = (ctx != NULL) ? ctx->buf.ia_blocks * 512 : 0; + /* source parent can be NULL, so do check_ancestry on a file */ + if (oldloc->parent) + check_ancestory(frame, oldloc->parent); + else + check_ancestory(frame, oldloc->inode); - quota_check_limit (frame, newloc->parent, this, NULL, NULL); - return 0; + return 0; err: - QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); + QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; - return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; wind: - STACK_WIND (frame, default_link_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, oldloc, - newloc, xdata); - return 0; + STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; } - int32_t -quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) +quota_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - int32_t ret = -1; 
- int64_t size = 0; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *old_dentry = NULL, *dentry = NULL; - char new_dentry_found = 0; + int32_t ret = -1; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *old_dentry = NULL, *dentry = NULL; + char new_dentry_found = 0; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - if (IA_ISREG (local->oldloc.inode->ia_type) - || IA_ISLNK (local->oldloc.inode->ia_type)) { - size = buf->ia_blocks * 512; - } + GF_VALIDATE_OR_GOTO("quota", local, out); - if (local->oldloc.parent != local->newloc.parent) { - quota_update_size (this, local->oldloc.parent, NULL, NULL, - (-size)); - quota_update_size (this, local->newloc.parent, NULL, NULL, - size); - } + if (!QUOTA_REG_OR_LNK_FILE(local->oldloc.inode->ia_type)) + goto out; - if (!(IA_ISREG (local->oldloc.inode->ia_type) - || IA_ISLNK (local->oldloc.inode->ia_type))) { - goto out; - } + ret = quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0); + if ((ret == -1) || (ctx == NULL)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->oldloc.inode->gfid)); - ret = quota_inode_ctx_get (local->oldloc.inode, this, &ctx, 0); - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). 
" - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->oldloc.inode->gfid)); + goto out; + } - goto out; - } - - LOCK (&ctx->lock); + LOCK(&ctx->lock); + { + list_for_each_entry(dentry, &ctx->parents, next) { - /* decision of whether to create a context in newloc->inode - * depends on fuse_rename_cbk's choice of inode it retains - * after rename. currently it just associates oldloc->inode - * with new parent and name. If this changes, following code - * should be changed to set a new context in newloc->inode. - */ - list_for_each_entry (dentry, &ctx->parents, next) { - if ((strcmp (dentry->name, local->oldloc.name) == 0) && - (uuid_compare (local->oldloc.parent->gfid, - dentry->par) == 0)) { - old_dentry = dentry; - } else if ((strcmp (dentry->name, - local->newloc.name) == 0) && - (uuid_compare (local->oldloc.parent->gfid, - dentry->par) == 0)) { - new_dentry_found = 1; - gf_log (this->name, GF_LOG_WARNING, - "new entry being linked (name:%s) for " - "inode (gfid:%s) is already present " - "in inode-dentry-list", dentry->name, - uuid_utoa (local->newloc.inode->gfid)); - break; - } - } - - if (old_dentry != NULL) { - __quota_dentry_free (old_dentry); - } else { - gf_log (this->name, GF_LOG_WARNING, - "dentry corresponding to the path just renamed " - "(name:%s) is not present", local->oldloc.name); - } - - if (!new_dentry_found) { - dentry = __quota_dentry_new (ctx, - (char *)local->newloc.name, - local->newloc.parent->gfid); - if (dentry == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (name:%s) " - "for inode(gfid:%s)", - local->newloc.name, - uuid_utoa (local->newloc.inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unlock; - } - } - - ctx->buf = *buf; + if ((strcmp(dentry->name, local->oldloc.name) == 0) && + (gf_uuid_compare(local->oldloc.parent->gfid, dentry->par) == + 0)) { + old_dentry = dentry; + } else if ((strcmp(dentry->name, local->newloc.name) == 0) && + 
(gf_uuid_compare(local->newloc.parent->gfid, + dentry->par) == 0)) { + new_dentry_found = 1; + gf_msg_debug(this->name, 0, + "new entry being " + "linked (name:%s) for inode (gfid:%s) " + "is in inode-dentry-list", + dentry->name, + uuid_utoa(local->oldloc.inode->gfid)); + } + + if (old_dentry && new_dentry_found) + break; + } + + if (old_dentry != NULL) { + __quota_dentry_free(old_dentry); + } else { + gf_msg_debug(this->name, 0, + "dentry corresponding" + "the path just renamed (name:%s) is not" + " present", + local->oldloc.name); + } + + if (!new_dentry_found) { + dentry = __quota_dentry_new(ctx, (char *)local->newloc.name, + local->newloc.parent->gfid); + if (dentry == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot create a new dentry (name:%s) " + "for inode(gfid:%s)", + local->newloc.name, + uuid_utoa(local->newloc.inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unlock; + } } + + ctx->buf = *buf; + } unlock: - UNLOCK (&ctx->lock); + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (rename, frame, op_ret, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent, xdata); + QUOTA_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); - return 0; + return 0; } - int32_t -quota_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +quota_rename_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - priv = this->private; + local = frame->local; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + GF_VALIDATE_OR_GOTO("quota", local, unwind); - op_errno = local->op_errno; + op_errno = local->op_errno; - if (local->op_ret == -1) { - goto unwind; - } + 
if (local->op_ret == -1) { + goto unwind; + } - STACK_WIND (frame, - priv->is_quota_on? quota_rename_cbk: default_rename_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, oldloc, - newloc, xdata); + STACK_WIND(frame, quota_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); - return 0; + return 0; unwind: - QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } - -int32_t -quota_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc, dict_t *xdata) +static int32_t +quota_rename_get_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - quota_priv_t *priv = NULL; - int32_t ret = -1, op_errno = ENOMEM; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - call_stub_t *stub = NULL; + quota_local_t *local = NULL; + int32_t ret = 0; + int64_t *size = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, out, op_errno, EINVAL); + GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, out, op_errno, EINVAL); + local = frame->local; + GF_ASSERT(local); + local->link_count = 1; + + if (op_ret < 0) + goto out; + + ret = dict_get_bin(xdata, QUOTA_SIZE_KEY, (void **)&size); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING, + "size key not present in dict"); + op_errno = EINVAL; + goto out; + } + local->delta = ntoh64(*size); + local->object_delta = 1; + quota_check_limit(frame, local->newloc.parent, this); + return 0; - priv = this->private; +out: + quota_handle_validate_error(frame, -1, op_errno); + return 0; +} - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); +void +quota_rename_continue(call_frame_t *frame) +{ + int32_t ret = -1; + int32_t op_errno = EIO; + quota_local_t *local = 
NULL; + uuid_t common_ancestor = {0}; + xlator_t *this = NULL; + quota_inode_ctx_t *ctx = NULL; - local = quota_local_new (); - if (local == NULL) { - goto err; - } + local = frame->local; + this = THIS; - frame->local = local; + if (local->op_ret < 0) { + op_errno = local->op_errno; + goto err; + } + + ret = quota_find_common_ancestor(local->oldloc.parent, local->newloc.parent, + &common_ancestor); + if (ret < 0 || gf_uuid_is_null(common_ancestor)) { + gf_msg(this->name, GF_LOG_ERROR, ESTALE, Q_MSG_ANCESTRY_BUILD_FAILED, + "failed to get " + "common_ancestor for %s and %s", + local->oldloc.path, local->newloc.path); + op_errno = ESTALE; + goto err; + } + + LOCK(&local->lock); + { + local->link_count = 1; + gf_uuid_copy(local->common_ancestor, common_ancestor); + } + UNLOCK(&local->lock); - ret = loc_copy (&local->oldloc, oldloc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } + if (QUOTA_REG_OR_LNK_FILE(local->oldloc.inode->ia_type)) { + ret = quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "quota context not set in inode (gfid:%s), " + "considering file size as zero while enforcing " + "quota on new ancestry", + uuid_utoa(local->oldloc.inode->gfid)); + + local->delta = 0; + local->object_delta = 1; + } else { + /* FIXME: We need to account for the size occupied by + * this inode on the target directory. To avoid double + * accounting, we need to modify enforcer to perform + * quota_check_limit only up till the least common + * ancestor directory inode*/ - ret = loc_copy (&local->newloc, newloc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } + /* FIXME: The following code assumes that regular files + * and link files are present, in their entirety, in a + * single brick. 
This *assumption is invalid in the + * case of stripe.*/ - stub = fop_rename_stub (frame, quota_rename_helper, oldloc, newloc, - xdata); - if (stub == NULL) { - goto err; + local->delta = ctx->buf.ia_blocks * 512; + local->object_delta = 1; } - local->link_count = 1; - local->stub = stub; - - if (IA_ISREG (oldloc->inode->ia_type) - || IA_ISLNK (oldloc->inode->ia_type)) { - ret = quota_inode_ctx_get (oldloc->inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "quota context not set in inode (gfid:%s), " - "considering file size as zero while enforcing " - "quota on new ancestry", - oldloc->inode ? uuid_utoa (oldloc->inode->gfid) - : "0"); - local->delta = 0; - } else { - local->delta = ctx->buf.ia_blocks * 512; - } - } else { - local->delta = 0; + } else if (IA_ISDIR(local->oldloc.inode->ia_type)) { + ret = quota_validate(frame, local->oldloc.inode, this, + quota_rename_get_size_cbk); + if (ret) { + op_errno = -ret; + goto err; } - quota_check_limit (frame, newloc->parent, this, NULL, NULL); - return 0; - -err: - QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL, - NULL, NULL, NULL, NULL, NULL); - return 0; + return; + } -wind: - STACK_WIND (frame, default_rename_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, oldloc, - newloc, xdata); + quota_check_limit(frame, local->newloc.parent, this); + return; - return 0; +err: + QUOTA_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return; } - int32_t -quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - int64_t size = 0; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL; - - if (op_ret < 0) { - goto out; - } + quota_priv_t *priv = NULL; + int32_t 
ret = -1; + int32_t op_errno = ENOMEM; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + local = quota_local_new(); + if (local == NULL) { + goto err; + } + + frame->local = local; + + ret = loc_copy(&local->oldloc, oldloc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + ret = loc_copy(&local->newloc, newloc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + /* No need to check quota limit if src and dst parents are same */ + if (oldloc->parent && newloc->parent && + !gf_uuid_compare(oldloc->parent->gfid, newloc->parent->gfid)) { + gf_msg_debug(this->name, 0, + "rename %s -> %s are " + "in the same directory, so skip check limit", + oldloc->path, newloc->path); + goto wind; + } + + stub = fop_rename_stub(frame, quota_rename_helper, oldloc, newloc, xdata); + if (stub == NULL) { + goto err; + } + + LOCK(&local->lock); + { + /* link_count here tell how many check_ancestry should be done + * before continuing the FOP + */ + local->link_count = 2; + local->stub = stub; + local->fop_continue_cbk = quota_rename_continue; + } + UNLOCK(&local->lock); - local = frame->local; - size = buf->ia_blocks * 512; + check_ancestory(frame, newloc->parent); + check_ancestory(frame, oldloc->parent); + return 0; - quota_update_size (this, local->loc.parent, NULL, NULL, size); +err: + QUOTA_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 1); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). 
" - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; - goto out; - } +wind: + STACK_WIND(frame, quota_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; +} - LOCK (&ctx->lock); - { - ctx->buf = *buf; - - dentry = __quota_dentry_new (ctx, (char *)local->loc.name, - local->loc.parent->gfid); - if (dentry == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (name:%s) for " - "inode(gfid:%s)", local->loc.name, - uuid_utoa (local->loc.inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - } +int32_t +quota_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL; + int32_t ret = -1; + + if (op_ret < 0) { + goto out; + } + + local = frame->local; + + ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1); + if ((ret == -1) || (ctx == NULL)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *buf; + + dentry = __quota_dentry_new(ctx, (char *)local->loc.name, + local->loc.parent->gfid); + if (dentry == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot create " + "a new dentry (name:%s) for inode(gfid:%s)", + local->loc.name, uuid_utoa(local->loc.inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; } - UNLOCK (&ctx->lock); + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent, xdata); + QUOTA_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); - return 0; + return 0; } - int -quota_symlink_helper (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata) +quota_symlink_helper(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + local = frame->local; - priv = this->private; + GF_VALIDATE_OR_GOTO("quota", local, unwind); - if (local->op_ret == -1) { - op_errno = local->op_errno; - goto unwind; - } + if (local->op_ret == -1) { + op_errno = local->op_errno; + goto unwind; + } - STACK_WIND (frame, - priv->is_quota_on? 
quota_symlink_cbk: default_symlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkpath, loc, umask, xdata); - return 0; + STACK_WIND(frame, quota_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } - int -quota_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata) +quota_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1; - int32_t op_errno = ENOMEM; - quota_local_t *local = NULL; - call_stub_t *stub = NULL; + quota_priv_t *priv = NULL; + int32_t ret = -1; + int32_t op_errno = ENOMEM; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto err; - } + local = quota_local_new(); + if (local == NULL) { + goto err; + } - frame->local = local; + frame->local = local; - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } + ret = loc_copy(&local->loc, loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } - stub = fop_symlink_stub (frame, quota_symlink_helper, linkpath, loc, - umask, xdata); - if (stub == NULL) { - goto err; - } + stub = fop_symlink_stub(frame, quota_symlink_helper, linkpath, loc, umask, + xdata); + if (stub == NULL) { + goto err; + } + LOCK(&local->lock); + { local->stub = stub; - local->delta = strlen (linkpath); + local->delta = strlen(linkpath); + local->object_delta = 1; 
local->link_count = 1; + } + UNLOCK(&local->lock); - quota_check_limit (frame, loc->parent, this, NULL, NULL); - return 0; + quota_check_limit(frame, loc->parent, this); + return 0; err: - QUOTA_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + QUOTA_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; -wind: - STACK_WIND (frame, - priv->is_quota_on? quota_symlink_cbk: default_symlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata); - return 0; + return 0; } - int32_t -quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +quota_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - int64_t delta = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512; + GF_VALIDATE_OR_GOTO("quota", local, out); - quota_update_size (this, local->loc.inode, NULL, NULL, delta); + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + goto out; + } - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). 
" - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } - - LOCK (&ctx->lock); - { - ctx->buf = *postbuf; - } - UNLOCK (&ctx->lock); + LOCK(&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + QUOTA_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int32_t -quota_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +quota_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + int32_t ret = -1; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + local = quota_local_new(); + if (local == NULL) { + goto err; + } - local = quota_local_new (); - if (local == NULL) { - goto err; - } + frame->local = local; - frame->local = local; + ret = loc_copy(&local->loc, loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } + STACK_WIND(frame, quota_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_truncate_cbk: default_truncate_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, loc, - offset, xdata); + return 0; - return 0; err: - QUOTA_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + QUOTA_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, + loc, offset, xdata); + return 0; } - int32_t -quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +quota_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - int64_t delta = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512; + GF_VALIDATE_OR_GOTO("quota", local, out); - quota_update_size (this, local->loc.inode, NULL, NULL, delta); - - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + goto out; + } - LOCK (&ctx->lock); - { - ctx->buf = *postbuf; - } - UNLOCK (&ctx->lock); + LOCK(&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - return 0; + QUOTA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int32_t -quota_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +quota_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) - goto err; + local = quota_local_new(); + if (local == NULL) + goto err; - frame->local = local; + frame->local = local; - local->loc.inode = inode_ref (fd->inode); + local->loc.inode = inode_ref(fd->inode); -wind: - STACK_WIND (frame, priv->is_quota_on? 
- quota_ftruncate_cbk: default_ftruncate_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd, - offset, xdata); + STACK_WIND(frame, quota_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); - return 0; + return 0; err: - QUOTA_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); + QUOTA_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; -} + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} -int32_t -quota_send_dir_limit_to_cli (call_frame_t *frame, xlator_t *this, - inode_t *inode, const char *name) +static int32_t +quota_send_dir_limit_to_cli(call_frame_t *frame, xlator_t *this, inode_t *inode, + const char *name, const int namelen) { - int32_t ret = 0; - char dir_limit [1024] = {0, }; - dict_t *dict = NULL; - quota_inode_ctx_t *ctx = NULL; - uint64_t value = 0; - quota_priv_t *priv = NULL; - - priv = this->private; - if (!priv->is_quota_on) { - snprintf (dir_limit, 1024, "Quota is disabled please turn on"); - goto dict_set; - } - - ret = inode_ctx_get (inode, this, &value); - if (ret < 0) - goto out; - - ctx = (quota_inode_ctx_t *)(unsigned long)value; - snprintf (dir_limit, 1024, "%"PRId64",%"PRId64, ctx->size, - ctx->hard_lim); + int32_t ret = 0; + int dir_limit_len = 0; + char dir_limit[64] = { + 0, + }; + dict_t *dict = NULL; + quota_inode_ctx_t *ctx = NULL; + uint64_t value = 0; + quota_priv_t *priv = NULL; + + priv = this->private; + if (!priv->is_quota_on) { + dir_limit_len = snprintf(dir_limit, sizeof(dir_limit), + "Quota is disabled please turn on"); + goto dict_set; + } + + ret = inode_ctx_get(inode, this, &value); + if (ret < 0) + goto out; + + ctx = (quota_inode_ctx_t *)(unsigned long)value; + dir_limit_len = snprintf(dir_limit, sizeof(dir_limit), + "%" PRId64 ",%" PRId64, ctx->size, ctx->hard_lim); dict_set: - dict = dict_new (); - if (dict == NULL) { - ret = -1; - goto out; - } + 
dict = dict_new(); + if (dict == NULL) { + ret = -1; + goto out; + } - ret = dict_set_str (dict, (char *) name, dir_limit); - if (ret < 0) - goto out; + ret = dict_set_nstrn(dict, (char *)name, namelen, dir_limit, dir_limit_len); + if (ret < 0) + goto out; - gf_log (this->name, GF_LOG_DEBUG, "str = %s", dir_limit); + gf_msg_debug(this->name, 0, "str = %s", dir_limit); - QUOTA_STACK_UNWIND (getxattr, frame, 0, 0, dict, NULL); + QUOTA_STACK_UNWIND(getxattr, frame, 0, 0, dict, NULL); - ret = 0; + ret = 0; out: - return ret; + if (dict) + dict_unref(dict); + return ret; } - int32_t -quota_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) +quota_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - int32_t ret = 0; + int32_t ret = 0; - if (name && strcasecmp (name, "trusted.limit.list") == 0) { - ret = quota_send_dir_limit_to_cli (frame, this, fd->inode, - name); - if (ret == 0) { - return 0; - } + if (name && strcasecmp(name, "trusted.limit.list") == 0) { + ret = quota_send_dir_limit_to_cli(frame, this, fd->inode, + "trusted.limit.list", + SLEN("trusted.limit.list")); + if (ret == 0) { + return 0; } + } - STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); - return 0; + STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; } - int32_t -quota_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +quota_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - int32_t ret = 0; - - if ((name != NULL) && strcasecmp (name, "trusted.limit.list") == 0) { - ret = quota_send_dir_limit_to_cli (frame, this, loc->inode, - name); - if (ret == 0) - return 0; - } - - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); - 
return 0; + int32_t ret = 0; + + if ((name != NULL) && strcasecmp(name, "trusted.limit.list") == 0) { + ret = quota_send_dir_limit_to_cli(frame, this, loc->inode, + "trusted.limit.list", + SLEN("trusted.limit.list")); + if (ret == 0) + return 0; + } + + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; } - int32_t -quota_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +quota_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - if (!IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler " - "has finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - } + GF_VALIDATE_OR_GOTO("quota", local, out); - goto out; + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + if (!IA_ISDIR(buf->ia_type)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); } - LOCK (&ctx->lock); - { - if (buf) - ctx->buf = *buf; - } - UNLOCK (&ctx->lock); + goto out; + } + + if (buf) { + LOCK(&ctx->lock); + ctx->buf = *buf; + UNLOCK(&ctx->lock); + } out: - QUOTA_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - return 0; + QUOTA_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); + return 0; } - int32_t -quota_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +quota_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; - int32_t ret = -1; - - priv = this->private; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t ret = -1; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + priv = this->private; + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto unwind; - } + frame->local = local; + ret = loc_copy(&local->loc, loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto unwind; + } -wind: - STACK_WIND (frame, priv->is_quota_on? 
quota_stat_cbk: default_stat_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, loc, - xdata); - return 0; + STACK_WIND(frame, quota_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (stat, frame, -1, ENOMEM, NULL, NULL); - return 0; -} + QUOTA_STACK_UNWIND(stat, frame, -1, ENOMEM, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, + loc, xdata); + return 0; +} int32_t -quota_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +quota_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - if (!IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler " - "has finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - } + GF_VALIDATE_OR_GOTO("quota", local, out); - goto out; + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + if (!IA_ISDIR(buf->ia_type)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); } - LOCK (&ctx->lock); - { - if (buf) - ctx->buf = *buf; - } - UNLOCK (&ctx->lock); + goto out; + } + + if (buf) { + LOCK(&ctx->lock); + ctx->buf = *buf; + UNLOCK(&ctx->lock); + } out: - QUOTA_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - return 0; + QUOTA_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); + return 0; } - int32_t -quota_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +quota_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; + frame->local = local; - local->loc.inode = inode_ref (fd->inode); + local->loc.inode = inode_ref(fd->inode); -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_fstat_cbk: default_fstat_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, fd, - xdata); - return 0; + STACK_WIND(frame, quota_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (fstat, frame, -1, ENOMEM, NULL, NULL); - return 0; -} + QUOTA_STACK_UNWIND(fstat, frame, -1, ENOMEM, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, + fd, xdata); + return 0; +} int32_t -quota_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *buf, dict_t *xdata) +quota_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } + GF_VALIDATE_OR_GOTO("quota", local, out); - LOCK (&ctx->lock); - { - ctx->buf = *buf; - } - UNLOCK (&ctx->lock); + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *buf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf, - xdata); - return 0; + QUOTA_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, buf, xdata); + return 0; } - int32_t -quota_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, - dict_t *xdata) +quota_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; - int32_t ret = -1; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t ret = -1; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; + frame->local = local; - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto unwind; - } + ret = loc_copy(&local->loc, loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto unwind; + } -wind: - STACK_WIND (frame, priv->is_quota_on? 
- quota_readlink_cbk: default_readlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, loc, - size, xdata); - return 0; + STACK_WIND(frame, quota_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (readlink, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; -} + QUOTA_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, + loc, size, xdata); + return 0; +} int32_t -quota_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *buf, struct iobref *iobref, - dict_t *xdata) +quota_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *buf, struct iobref *iobref, + dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } + GF_VALIDATE_OR_GOTO("quota", local, out); - LOCK (&ctx->lock); - { - ctx->buf = *buf; - } - UNLOCK (&ctx->lock); + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *buf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, - buf, iobref, xdata); - return 0; + QUOTA_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, + iobref, xdata); + return 0; } - int32_t -quota_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +quota_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; + frame->local = local; - local->loc.inode = inode_ref (fd->inode); + local->loc.inode = inode_ref(fd->inode); -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_readv_cbk: default_readv_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, fd, - size, offset, flags, xdata); - return 0; + STACK_WIND(frame, quota_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL, - NULL); - return 0; -} + QUOTA_STACK_UNWIND(readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} int32_t -quota_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +quota_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on " - "inode (%s). " - "If quota is not enabled recently and crawler has " - "finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - goto out; - } + GF_VALIDATE_OR_GOTO("quota", local, out); - LOCK (&ctx->lock); - { - ctx->buf = *postbuf; - } - UNLOCK (&ctx->lock); + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; + QUOTA_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } - int32_t -quota_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, - dict_t *xdata) +quota_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - local->loc.inode = inode_ref (fd->inode); + local->loc.inode = inode_ref(fd->inode); - frame->local = local; + frame->local = local; -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_fsync_cbk: default_fsync_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, - flags, xdata); - return 0; + STACK_WIND(frame, quota_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(fsync, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + return 0; } - int32_t -quota_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +quota_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - if (!IA_ISDIR (statpost->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is " - "NULL on inode (%s). " - "If quota is not enabled recently and crawler " - "has finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - } + GF_VALIDATE_OR_GOTO("quota", local, out); - goto out; + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + if (!IA_ISDIR(statpost->ia_type)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); } - LOCK (&ctx->lock); - { - if (statpost) - ctx->buf = *statpost; - } - UNLOCK (&ctx->lock); + goto out; + } + + if (statpost) { + LOCK(&ctx->lock); + ctx->buf = *statpost; + UNLOCK(&ctx->lock); + } out: - QUOTA_STACK_UNWIND (setattr, frame, op_ret, op_errno, statpre, - statpost, xdata); - return 0; + QUOTA_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + return 0; } - int32_t -quota_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +quota_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; - int32_t ret = -1; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t ret = -1; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; + frame->local = local; - ret = loc_copy (&local->loc, loc); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto unwind; - } + ret = loc_copy(&local->loc, loc); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto unwind; + } -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_setattr_cbk: default_setattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, loc, - stbuf, valid, xdata); - return 0; + STACK_WIND(frame, quota_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; -} + QUOTA_STACK_UNWIND(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} int32_t -quota_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +quota_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; - if (op_ret < 0) { - goto out; - } + if (op_ret < 0) { + goto out; + } - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto out; - } + local = frame->local; - quota_inode_ctx_get (local->loc.inode, this, &ctx, 0); - if (ctx == NULL) { - if (!IA_ISDIR (statpost->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is " - "NULL on inode (%s). " - "If quota is not enabled recently and crawler " - "has finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - } + GF_VALIDATE_OR_GOTO("quota", local, out); - goto out; + quota_inode_ctx_get(local->loc.inode, this, &ctx, 0); + if (ctx == NULL) { + if (!IA_ISDIR(statpost->ia_type)) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). 
If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); } - LOCK (&ctx->lock); - { - ctx->buf = *statpost; - } - UNLOCK (&ctx->lock); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *statpost; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, statpre, - statpost, xdata); - return 0; + QUOTA_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, statpost, + xdata); + return 0; } - int32_t -quota_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +quota_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - quota_priv_t *priv = NULL; - quota_local_t *local = NULL; - - priv = this->private; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + priv = this->private; + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } - frame->local = local; + frame->local = local; - local->loc.inode = inode_ref (fd->inode); + local->loc.inode = inode_ref(fd->inode); -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_fsetattr_cbk: default_fsetattr_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, fd, - stbuf, valid, xdata); - return 0; + STACK_WIND(frame, quota_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; -} + QUOTA_STACK_UNWIND(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} int32_t -quota_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +quota_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int32_t ret = -1; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL; - - local = frame->local; - if (op_ret < 0) { - goto unwind; - } - - ret = quota_inode_ctx_get (inode, this, &ctx, 1); - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_WARNING, "cannot create quota " - "context in inode (gfid:%s)", uuid_utoa (inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - LOCK (&ctx->lock); - { - ctx->buf = *buf; - - dentry = __quota_dentry_new (ctx, (char *)local->loc.name, - local->loc.parent->gfid); - if (dentry == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot create a new dentry (name:%s) for " - "inode(gfid:%s)", local->loc.name, - uuid_utoa (local->loc.inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unlock; - } - } + int32_t ret = -1; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL; + + local = frame->local; + if (op_ret < 0) { + goto 
unwind; + } + + ret = quota_inode_ctx_get(inode, this, &ctx, 1); + if ((ret == -1) || (ctx == NULL)) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "cannot create quota context in " + "inode(gfid:%s)", + uuid_utoa(inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + LOCK(&ctx->lock); + { + ctx->buf = *buf; + + dentry = __quota_dentry_new(ctx, (char *)local->loc.name, + local->loc.parent->gfid); + if (dentry == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "cannot create a new dentry " + "(name:%s) for inode(gfid:%s)", + local->loc.name, uuid_utoa(local->loc.inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unlock; + } + } unlock: - UNLOCK (&ctx->lock); + UNLOCK(&ctx->lock); unwind: - QUOTA_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, - buf, preparent, postparent, xdata); - return 0; + QUOTA_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int -quota_mknod_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +quota_mknod_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + local = frame->local; - priv = this->private; + GF_VALIDATE_OR_GOTO("quota", local, unwind); - if (local->op_ret == -1) { - op_errno = local->op_errno; - goto unwind; - } + if (local->op_ret == -1) { + op_errno = local->op_errno; + goto unwind; + } - STACK_WIND (frame, - priv->is_quota_on? 
quota_mknod_cbk: default_mknod_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, loc, - mode, rdev, umask, xdata); + STACK_WIND(frame, quota_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); - return 0; + return 0; unwind: - QUOTA_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, - NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } - int -quota_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +quota_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t ret = -1; - quota_local_t *local = NULL; - call_stub_t *stub = NULL; - - priv = this->private; - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - local = quota_local_new (); - if (local == NULL) { - goto err; - } - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed"); - goto err; - } - - stub = fop_mknod_stub (frame, quota_mknod_helper, loc, mode, rdev, - umask, xdata); - if (stub == NULL) { - goto err; - } - + quota_priv_t *priv = NULL; + int32_t ret = -1; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + QUOTA_WIND_FOR_INTERNAL_FOP(xdata, off); + + local = quota_local_new(); + if (local == NULL) { + goto err; + } + + frame->local = local; + + ret = loc_copy(&local->loc, loc); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "loc_copy failed"); + goto err; + } + + stub = fop_mknod_stub(frame, quota_mknod_helper, loc, mode, rdev, umask, + xdata); + if (stub == NULL) { + goto err; + } + + LOCK(&local->lock); + { local->link_count = 1; local->stub = stub; local->delta = 0; + local->object_delta = 1; + } + UNLOCK(&local->lock); - 
quota_check_limit (frame, loc->parent, this, NULL, NULL); - return 0; -err: - QUOTA_STACK_UNWIND (mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, - NULL); + quota_check_limit(frame, loc->parent, this); + return 0; - return 0; - -wind: - STACK_WIND (frame, - priv->is_quota_on? quota_mknod_cbk: default_mknod_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, loc, - mode, rdev, umask, xdata); - - return 0; +err: + QUOTA_STACK_UNWIND(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; } int -quota_setxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +quota_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - int ret = 0; - - local = frame->local; - if (!local) - goto out; - - ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1); - if ((ret < 0) || (ctx == NULL)) { - op_errno = ENOMEM; - goto out; - } - - LOCK (&ctx->lock); - { - ctx->hard_lim = local->limit.hard_lim; - ctx->soft_lim = local->limit.soft_lim_percent; - } - UNLOCK (&ctx->lock); + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + int ret = 0; + + if (op_ret < 0) { + goto out; + } + + local = frame->local; + if (!local) + goto out; + + ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1); + if ((ret < 0) || (ctx == NULL)) { + op_errno = -1; + goto out; + } + + LOCK(&ctx->lock); + { + ctx->hard_lim = local->limit.hl; + ctx->soft_lim = local->limit.sl; + ctx->object_hard_lim = local->object_limit.hl; + ctx->object_soft_lim = local->object_limit.sl; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); - return 0; + QUOTA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; } int -quota_setxattr 
(call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags, dict_t *xdata) +quota_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) { - quota_priv_t *priv = NULL; - int op_errno = EINVAL; - int op_ret = -1; - int64_t hard_lim = -1, soft_lim = -1; - quota_local_t *local = NULL; - char *src = NULL; - char *dst = NULL; - int len = 0; - int ret = -1; - - priv = this->private; - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - if (0 <= frame->root->pid) { - ret = dict_get_ptr_and_len (dict, QUOTA_LIMIT_KEY, - (void **)&src, &len); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "dict_get on %s " - "failed", QUOTA_LIMIT_KEY); - } else { - dst = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - if (dst) - memcpy (dst, src, len); - } - - GF_REMOVE_INTERNAL_XATTR ("trusted.glusterfs.quota*", - dict); - if (!ret && IA_ISDIR (loc->inode->ia_type) && dst) { - ret = dict_set_dynptr (dict, QUOTA_LIMIT_KEY, - dst, len); - if (ret) - gf_log (this->name, GF_LOG_WARNING, "setting " - "key %s failed", QUOTA_LIMIT_KEY); - else - dst = NULL; - } - } - - quota_get_limits (this, dict, &hard_lim, &soft_lim); - - if (hard_lim > 0) { - local = quota_local_new (); - if (local == NULL) { - op_errno = ENOMEM; - goto err; - } - - frame->local = local; - loc_copy (&local->loc, loc); - - local->limit.hard_lim = hard_lim; - local->limit.soft_lim_percent = soft_lim; + quota_priv_t *priv = NULL; + int op_errno = EINVAL; + int op_ret = -1; + int64_t hard_lim = -1; + int64_t soft_lim = -1; + int64_t object_hard_limit = -1; + int64_t object_soft_limit = -1; + quota_local_t *local = NULL; + gf_boolean_t internal_fop = _gf_false; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + if (xdata && dict_get_sizen(xdata, 
GLUSTERFS_INTERNAL_FOP_KEY)) + internal_fop = _gf_true; + + if (frame->root->pid >= 0 && internal_fop == _gf_false) { + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.quota*", dict, op_errno, + err); + GF_IF_INTERNAL_XATTR_GOTO("trusted.pgfid*", dict, op_errno, err); + } + + quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit, + &object_soft_limit); + + if (hard_lim > 0 || object_hard_limit > 0) { + local = quota_local_new(); + if (local == NULL) { + op_errno = ENOMEM; + goto err; } - -wind: - STACK_WIND (frame, - priv->is_quota_on? quota_setxattr_cbk: default_setxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, loc, - dict, flags, xdata); - return 0; + frame->local = local; + loc_copy(&local->loc, loc); + } + + if (hard_lim > 0) { + local->limit.hl = hard_lim; + local->limit.sl = soft_lim; + } + + if (object_hard_limit > 0) { + local->object_limit.hl = object_hard_limit; + local->object_limit.sl = object_soft_limit; + } + + STACK_WIND(frame, quota_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; err: - QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL); - return 0; + QUOTA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); + return 0; + +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; } int -quota_fsetxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +quota_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - quota_inode_ctx_t *ctx = NULL; - quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + + if (op_ret < 0) + goto out; + + local = frame->local; + if (!local) + goto out; + + op_ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1); + if ((op_ret < 0) || (ctx == NULL)) { + op_errno = ENOMEM; + goto out; + } + + 
LOCK(&ctx->lock); + { + ctx->hard_lim = local->limit.hl; + ctx->soft_lim = local->limit.sl; + ctx->object_hard_lim = local->object_limit.hl; + ctx->object_soft_lim = local->object_limit.sl; + } + UNLOCK(&ctx->lock); - local = frame->local; - if (!local) - goto out; +out: + QUOTA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} - op_ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1); - if ((op_ret < 0) || (ctx == NULL)) { - op_errno = ENOMEM; - goto out; +int +quota_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + quota_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + quota_local_t *local = NULL; + int64_t hard_lim = -1; + int64_t soft_lim = -1; + int64_t object_hard_limit = -1; + int64_t object_soft_limit = -1; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + if (0 <= frame->root->pid) { + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.quota*", dict, op_errno, + err); + GF_IF_INTERNAL_XATTR_GOTO("trusted.pgfid*", dict, op_errno, err); + } + + quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit, + &object_soft_limit); + + if (hard_lim > 0 || object_hard_limit > 0) { + local = quota_local_new(); + if (local == NULL) { + op_errno = ENOMEM; + goto err; } + frame->local = local; + local->loc.inode = inode_ref(fd->inode); + } + + if (hard_lim > 0) { + local->limit.hl = hard_lim; + local->limit.sl = soft_lim; + } + + if (object_hard_limit > 0) { + local->object_limit.hl = object_hard_limit; + local->object_limit.sl = object_soft_limit; + } + + STACK_WIND(frame, quota_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +err: + QUOTA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); + return 0; - LOCK (&ctx->lock); - { - ctx->hard_lim = local->limit.hard_lim; - 
ctx->soft_lim = local->limit.soft_lim_percent; - } - UNLOCK (&ctx->lock); +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +} -out: - QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); - return 0; +int +quota_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + QUOTA_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + return 0; } int -quota_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int flags, dict_t *xdata) +quota_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - quota_local_t *local = NULL; - int64_t hard_lim = -1, soft_lim = -1; + quota_priv_t *priv = NULL; + int32_t op_errno = EINVAL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO(this, err); - if (0 <= frame->root->pid) - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict, - op_errno, err); + /* all quota xattrs can be cleaned up by doing setxattr on special key. + * Hence its ok that we don't allow removexattr on quota keys here. 
+ */ + if (frame->root->pid >= 0) { + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.quota*", name, op_errno, + err); + GF_IF_NATIVE_XATTR_GOTO("trusted.pgfid*", name, op_errno, err); + } - quota_get_limits (this, dict, &hard_lim, &soft_lim); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(loc, err); - if (hard_lim > 0) { - local = quota_local_new (); - frame->local = local; - local->loc.inode = inode_ref (fd->inode); + STACK_WIND(frame, quota_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; - local->limit.hard_lim = hard_lim; - local->limit.soft_lim_percent = soft_lim; - } +err: + QUOTA_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; -wind: - STACK_WIND (frame, priv->is_quota_on? - quota_fsetxattr_cbk: default_fsetxattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, fd, - dict, flags, xdata); - return 0; - err: - QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); - return 0; +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; } - int -quota_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +quota_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - QUOTA_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); - return 0; + QUOTA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + return 0; } int -quota_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +quota_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t op_errno = EINVAL; - - priv = this->private; - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + quota_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + priv = this->private; + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + 
VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + if (frame->root->pid >= 0) { + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.quota*", name, op_errno, + err); + GF_IF_NATIVE_XATTR_GOTO("trusted.pgfid*", name, op_errno, err); + } + STACK_WIND(frame, quota_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +err: + QUOTA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); + return 0; - VALIDATE_OR_GOTO (this, err); +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +} - /* all quota xattrs can be cleaned up by doing setxattr on special key. - * Hence its ok that we don't allow removexattr on quota keys here. +int32_t +quota_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + inode_t *inode = NULL; + uint64_t value = 0; + int64_t usage = -1; + int64_t avail = -1; + int64_t blocks = 0; + quota_inode_ctx_t *ctx = NULL; + int ret = 0; + + inode = cookie; + + /* This fop will fail mostly in case of client disconnect, + * which is already logged. Hence, not logging here */ + if (op_ret == -1) + goto unwind; + /* + * We should never get here unless quota_statfs (below) sent us a + * cookie, and it would only do so if the value was non-NULL. This + * check is therefore just routine defensive coding. 
+ */ + + GF_VALIDATE_OR_GOTO("quota", inode, unwind); + + inode_ctx_get(inode, this, &value); + ctx = (quota_inode_ctx_t *)(unsigned long)value; + if (!ctx || ctx->hard_lim <= 0) + goto unwind; + + { /* statfs is adjusted in this code block */ + usage = (ctx->size) / buf->f_bsize; + + blocks = ctx->hard_lim / buf->f_bsize; + buf->f_blocks = blocks; + + avail = buf->f_blocks - usage; + avail = max(avail, 0); + + buf->f_bfree = avail; + /* + * We have to assume that the total assigned quota + * won't cause us to dip into the reserved space, + * because dealing with the overcommitted cases is + * just too hairy (especially when different bricks + * might be using different reserved percentages and + * such). */ - GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*", - name, op_errno, err); + buf->f_bavail = buf->f_bfree; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (loc, err); + xdata = xdata ? dict_ref(xdata) : dict_new(); + if (!xdata) + goto unwind; -wind: - STACK_WIND (frame, priv->is_quota_on? 
- quota_removexattr_cbk: default_removexattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, - loc, name, xdata); - return 0; -err: - QUOTA_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - return 0; -} + ret = dict_set_int8(xdata, "quota-deem-statfs", 1); + if (-1 == ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, + "Dict set failed, deem-statfs option may " + "have no effect"); +unwind: + QUOTA_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata); -int -quota_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - QUOTA_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); - return 0; + if (xdata) + dict_unref(xdata); + + return 0; } -int -quota_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +int32_t +quota_statfs_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) { - quota_priv_t *priv = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; + quota_local_t *local = frame->local; + int op_errno = EINVAL; - priv = this->private; + GF_VALIDATE_OR_GOTO("quota", local, err); - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + if (-1 == local->op_ret) { + op_errno = local->op_errno; + goto err; + } - GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*", - name, op_errno, err); + STACK_WIND_COOKIE(frame, quota_statfs_cbk, local->inode, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +err: + QUOTA_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); -wind: - STACK_WIND (frame, priv->is_quota_on? 
- quota_fremovexattr_cbk: default_fremovexattr_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, - fd, name, xdata); - return 0; - err: - QUOTA_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); - return 0; + return 0; } - int32_t -quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf, - dict_t *xdata) +quota_statfs_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - inode_t *inode = NULL; - uint64_t value = 0; - int64_t usage = -1; - int64_t avail = -1; - int64_t blocks = 0; - quota_inode_ctx_t *ctx = NULL; - int ret = 0; - gf_boolean_t dict_created = _gf_false; - - inode = cookie; - - /* This fop will fail mostly in case of client disconnect's, - * which is already logged. Hence, not logging here */ - if (op_ret == -1) - goto unwind; - /* - * We should never get here unless quota_statfs (below) sent us a - * cookie, and it would only do so if the value was non-NULL. This - * check is therefore just routine defensive coding. - */ - if (!inode) { - gf_log(this->name,GF_LOG_WARNING, - "null inode, cannot adjust for quota"); - goto unwind; - } - - inode_ctx_get (inode, this, &value); - if (!value) { - goto unwind; - } - - /* if limit is set on this inode, report statfs based on this inode - * else report based on root. 
- */ - ctx = (quota_inode_ctx_t *)(unsigned long)value; - if (ctx->hard_lim <= 0) { - inode_ctx_get (inode->table->root, this, &value); - ctx = (quota_inode_ctx_t *)(unsigned long) value; - if (!ctx) - goto unwind; - } + quota_local_t *local = NULL; + int32_t ret = 0; + quota_inode_ctx_t *ctx = NULL; + uint64_t value = 0; + quota_meta_t size = { + 0, + }; + + local = frame->local; + + if (op_ret < 0) + goto resume; + + GF_ASSERT(local); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, resume, op_errno, EINVAL); + GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, resume, op_errno, EINVAL); + + ret = inode_ctx_get(local->validate_loc.inode, this, &value); + + ctx = (quota_inode_ctx_t *)(unsigned long)value; + if ((ret == -1) || (ctx == NULL)) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_INODE_CTX_GET_FAILED, + "quota context is not present in inode (gfid:%s)", + uuid_utoa(local->validate_loc.inode->gfid)); + op_errno = EINVAL; + goto resume; + } + + ret = quota_dict_get_meta(xdata, QUOTA_SIZE_KEY, SLEN(QUOTA_SIZE_KEY), + &size); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING, + "size key not present in " + "dict"); + op_errno = EINVAL; + } + + LOCK(&ctx->lock); + { + ctx->size = size.size; + ctx->validate_time = gf_time(); + ctx->file_count = size.file_count; + ctx->dir_count = size.dir_count; + } + UNLOCK(&ctx->lock); - usage = (ctx->size) / buf->f_bsize; +resume: + local->op_errno = op_errno; + quota_link_count_decrement(frame); + return 0; +} - if (ctx->hard_lim > 0) { - blocks = ctx->hard_lim / buf->f_bsize; - buf->f_blocks = blocks; +void +quota_get_limit_dir_continuation(struct list_head *parents, inode_t *inode, + int32_t op_ret, int32_t op_errno, void *data) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + quota_dentry_t *entry = NULL; + inode_t *parent = NULL; - avail = buf->f_blocks - usage; - avail = (avail >= 0) ? 
avail : 0; + frame = data; + this = THIS; - if (buf->f_bfree > avail) { - buf->f_bfree = avail; - } - /* - * We have to assume that the total assigned quota - * won't cause us to dip into the reserved space, - * because dealing with the overcommitted cases is - * just too hairy (especially when different bricks - * might be using different reserved percentages and - * such). - */ - buf->f_bavail = buf->f_bfree; + if ((op_ret < 0) || list_empty(parents)) { + if (op_ret >= 0) { + gf_msg(this->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED, + "Couldn't build ancestry for inode (gfid:%s). " + "Without knowing ancestors till root, quota " + "cannot be enforced. " + "Hence, failing fop with EIO", + uuid_utoa(inode->gfid)); + op_errno = EIO; } - if (!xdata) { - xdata = dict_new (); - if (!xdata) - goto unwind; - dict_created = _gf_true; - } + quota_handle_validate_error(frame, -1, op_errno); + goto out; + } - ret = dict_set_int8 (xdata, "quota-deem-statfs", 1); - if (-1 == ret) - gf_log (this->name, GF_LOG_ERROR, "Dict set failed, " - "deem-statfs option may have no effect"); + entry = list_entry(parents, quota_dentry_t, next); + parent = inode_find(inode->table, entry->par); -unwind: - QUOTA_STACK_UNWIND (statfs, frame, op_ret, op_errno, buf, xdata); + quota_get_limit_dir(frame, parent, this); - if (dict_created) - dict_unref (xdata); - return 0; + inode_unref(parent); +out: + return; } - -int32_t -quota_statfs_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata) +void +quota_statfs_continue(call_frame_t *frame, xlator_t *this, inode_t *inode) { - quota_local_t *local = NULL; - int op_errno = EINVAL; - - GF_VALIDATE_OR_GOTO ("quota", (local = frame->local), err); - - if (-1 == local->op_ret) { - op_errno = local->op_errno; - goto err; - } - - STACK_WIND_COOKIE (frame, quota_statfs_cbk, loc->inode, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->statfs, loc, xdata); - return 0; -err: - QUOTA_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, 
NULL); - return 0; + quota_local_t *local = frame->local; + int ret = -1; + + LOCK(&local->lock); + { + local->inode = inode_ref(inode); + } + UNLOCK(&local->lock); + + ret = quota_validate(frame, local->inode, this, quota_statfs_validate_cbk); + if (0 > ret) + quota_handle_validate_error(frame, -1, -ret); } -int32_t -quota_statfs_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, - struct iatt *postparent) +void +quota_get_limit_dir(call_frame_t *frame, inode_t *cur_inode, xlator_t *this) { - quota_local_t *local = NULL; - int32_t ret = 0; - quota_inode_ctx_t *ctx = NULL; - int64_t *size = 0; - uint64_t value = 0; + inode_t *inode = NULL; + inode_t *parent = NULL; + uint64_t value = 0; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = frame->local; - local = frame->local; + if (!cur_inode) + goto out; - if (op_ret < 0) - goto resume; + inode = inode_ref(cur_inode); + while (inode) { + value = 0; + inode_ctx_get(inode, this, &value); - GF_ASSERT (local); - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, resume, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, resume, op_errno, - EINVAL); + if (value) { + ctx = (quota_inode_ctx_t *)(unsigned long)value; + if (ctx->hard_lim > 0) + break; + } - ret = inode_ctx_get (local->validate_loc.inode, this, &value); + if (__is_root_gfid(inode->gfid)) + goto off; - ctx = (quota_inode_ctx_t *)(unsigned long)value; - if ((ret == -1) || (ctx == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "quota context is not present in inode (gfid:%s)", - uuid_utoa (local->validate_loc.inode->gfid)); - op_errno = EINVAL; - goto resume; + parent = inode_parent(inode, 0, NULL); + if (!parent) { + (void)quota_build_ancestry(inode, quota_get_limit_dir_continuation, + frame); + goto out; } - ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size); - if (ret < 0) { - gf_log (this->name, 
GF_LOG_WARNING, - "size key not present in dict"); - op_errno = EINVAL; - goto resume; - } + inode_unref(inode); + inode = parent; + } - LOCK (&ctx->lock); - { - ctx->size = ntoh64 (*size); - gettimeofday (&ctx->tv, NULL); - } - UNLOCK (&ctx->lock); + quota_statfs_continue(frame, this, inode); + inode_unref(inode); + return; -resume: - --local->link_count; +off: + gf_msg_debug(this->name, 0, "No limit set on the inode or it's parents."); - quota_resume_fop_if_validation_done (local); - return 0; + QUOTA_STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, &local->loc, + local->xdata); +out: + inode_unref(inode); + + return; } int32_t -quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +quota_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - quota_local_t *local = NULL; - int op_errno = 0; - call_stub_t *stub = NULL; - quota_priv_t *priv = NULL; - int ret = 0; + int op_errno = 0; + int ret = -1; + int8_t ignore_deem_statfs = 0; + quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + call_stub_t *stub = NULL; - priv = this->private; + priv = this->private; + GF_ASSERT(loc); - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - if (priv->consider_statfs && loc->inode) { - local = quota_local_new (); - if (!local) { - op_errno = ENOMEM; - goto err; - } - frame->local = local; + ret = dict_get_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, + &ignore_deem_statfs); + ret = 0; - local->inode = inode_ref (loc->inode); - local->link_count = 1; + if (ignore_deem_statfs) + goto off; - stub = fop_statfs_stub (frame, quota_statfs_helper, loc, xdata); - if (!stub) { - op_errno = ENOMEM; - goto err; - } + if (priv->consider_statfs && loc->inode) { + local = quota_local_new(); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; - local->stub = stub; + ret = loc_copy(&local->loc, loc); + if (-1 == ret) { + op_errno = ENOMEM; + goto err; + } - 
ret = quota_validate (frame, local->inode, this, - quota_statfs_validate_cbk); - if (0 > ret) { - op_errno = -ret; - --local->link_count; - } + if (xdata) + local->xdata = dict_ref(xdata); + + stub = fop_statfs_stub(frame, quota_statfs_helper, &local->loc, + local->xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + LOCK(&local->lock); + { + local->link_count = 1; + local->stub = stub; + } + UNLOCK(&local->lock); + + quota_get_limit_dir(frame, loc->inode, this); - quota_resume_fop_if_validation_done (local); - } - else { - /* - * We have to make sure that we never get to quota_statfs_cbk - * with a cookie that points to something other than an inode, - * which is exactly what would happen with STACK_UNWIND using - * that as a callback. Therefore, use default_statfs_cbk in - * this case instead. - * - * Also if the option deem-statfs is not set to "on" don't - * bother calculating quota limit on / in statfs_cbk. - */ - if (priv->consider_statfs) - gf_log(this->name,GF_LOG_WARNING, - "missing inode, cannot adjust for quota"); -wind: - STACK_WIND (frame, default_statfs_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->statfs, loc, xdata); - } return 0; + } + + /* + * We have to make sure that we never get to quota_statfs_cbk + * with a cookie that points to something other than an inode, + * which is exactly what would happen with STACK_UNWIND using + * that as a callback. Therefore, use default_statfs_cbk in + * this case instead. + * + * Also if the option deem-statfs is not set to "on" don't + * bother calculating quota limit on / in statfs_cbk. 
+ */ + if (priv->consider_statfs) + gf_log(this->name, GF_LOG_ERROR, + "Missing inode, can't adjust for quota"); + +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs, + loc, xdata); + return 0; err: - STACK_UNWIND_STRICT (statfs, frame, -1, op_errno, NULL, NULL); + QUOTA_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); - if (local) - quota_local_cleanup (this, local); - return 0; + return 0; } int -quota_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, - dict_t *xdata) +quota_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - gf_dirent_t *entry = NULL; - quota_local_t *local = NULL; - loc_t loc = {0, }; + gf_dirent_t *entry = NULL; + quota_local_t *local = NULL; + loc_t loc = { + 0, + }; - if (op_ret <= 0) - goto unwind; + if (op_ret <= 0) + goto unwind; - local = frame->local; + local = frame->local; - list_for_each_entry (entry, &entries->list, list) { - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0)) - continue; + list_for_each_entry(entry, &entries->list, list) + { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0) || entry->inode == NULL) + continue; - uuid_copy (loc.gfid, entry->d_stat.ia_gfid); - loc.inode = inode_ref (entry->inode); - loc.parent = inode_ref (local->loc.inode); - uuid_copy (loc.pargfid, loc.parent->gfid); - loc.name = entry->d_name; + gf_uuid_copy(loc.gfid, entry->d_stat.ia_gfid); + loc.inode = inode_ref(entry->inode); + loc.parent = inode_ref(local->loc.inode); + gf_uuid_copy(loc.pargfid, loc.parent->gfid); + loc.name = entry->d_name; - quota_fill_inodectx (this, entry->inode, entry->dict, - &loc, &entry->d_stat, &op_errno); + quota_fill_inodectx(this, entry->inode, entry->dict, &loc, + &entry->d_stat, &op_errno); - loc_wipe (&loc); - } + loc_wipe(&loc); + } unwind: - QUOTA_STACK_UNWIND 
(readdirp, frame, op_ret, op_errno, entries, xdata); + QUOTA_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); - return 0; + return 0; } int -quota_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +quota_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - quota_priv_t *priv = NULL; - int ret = 0; - gf_boolean_t new_dict = _gf_false; - quota_local_t *local = NULL; + quota_priv_t *priv = NULL; + int ret = 0; + gf_boolean_t new_dict = _gf_false; + quota_local_t *local = NULL; - priv = this->private; + priv = this->private; - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); + WIND_IF_QUOTAOFF(priv->is_quota_on, off); - local = quota_local_new (); + local = quota_local_new(); - if (local == NULL) { - goto err; - } + if (local == NULL) { + goto err; + } - frame->local = local; + frame->local = local; + + local->loc.inode = inode_ref(fd->inode); - local->loc.inode = inode_ref (fd->inode); + if (dict == NULL) { + dict = dict_new(); + new_dict = _gf_true; + } - if (dict == NULL) { - dict = dict_new (); - new_dict = _gf_true; + if (dict) { + ret = dict_set_int8(dict, QUOTA_LIMIT_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set of key for hard-limit"); + goto err; } + } - if (dict) { - ret = dict_set_int8 (dict, QUOTA_LIMIT_KEY, 1); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "dict set of key for hard-limit failed"); - goto err; - } + if (dict) { + ret = dict_set_int8(dict, QUOTA_LIMIT_OBJECTS_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set of key for hard-limit " + "failed"); + goto err; } + } -wind: - STACK_WIND (frame, - priv->is_quota_on? 
quota_readdirp_cbk: default_readdirp_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, fd, - size, offset, dict); + STACK_WIND(frame, quota_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); - if (new_dict) { - dict_unref (dict); - } + if (new_dict) { + dict_unref(dict); + } - return 0; + return 0; err: - STACK_UNWIND_STRICT (readdirp, frame, -1, EINVAL, NULL, NULL); + STACK_UNWIND_STRICT(readdirp, frame, -1, EINVAL, NULL, NULL); - if (new_dict) { - dict_unref (dict); - } + if (new_dict) { + dict_unref(dict); + } - return 0; + return 0; + +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, + fd, size, offset, dict); + return 0; } int32_t @@ -3753,522 +4644,693 @@ quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - int32_t ret = 0; - uint64_t ctx_int = 0; - quota_inode_ctx_t *ctx = NULL; - quota_local_t *local = NULL; - quota_dentry_t *dentry = NULL; - int64_t delta = 0; - - local = frame->local; - - if ((op_ret < 0) || (local == NULL)) { - goto out; - } - - ret = inode_ctx_get (local->loc.inode, this, &ctx_int); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context", local->loc.path); - goto out; - } - - ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int; - - if (ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "quota context not set in %s (gfid:%s)", - local->loc.path, uuid_utoa (local->loc.inode->gfid)); - goto out; - } - - LOCK (&ctx->lock); - { - ctx->buf = *postbuf; - } - UNLOCK (&ctx->lock); - - list_for_each_entry (dentry, &ctx->parents, next) { - delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512; - quota_update_size (this, local->loc.inode, - dentry->name, dentry->par, delta); - } + int32_t ret = 0; + uint64_t ctx_int = 0; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + + local = frame->local; + + if 
((op_ret < 0) || (local == NULL)) { + goto out; + } + + ret = inode_ctx_get(local->loc.inode, this, &ctx_int); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "%s: failed to get the context", local->loc.path); + goto out; + } + + ctx = (quota_inode_ctx_t *)(unsigned long)ctx_int; + + if (ctx == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED, + "quota context not set in %s (gfid:%s)", local->loc.path, + uuid_utoa(local->loc.inode->gfid)); + goto out; + } + + LOCK(&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK(&ctx->lock); out: - QUOTA_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + QUOTA_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); - return 0; + return 0; } - int32_t -quota_fallocate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t mode, off_t offset, size_t len, dict_t *xdata) +quota_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata) { - quota_local_t *local = NULL; - int32_t op_errno = EINVAL; - quota_priv_t *priv = NULL; + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - goto unwind; - } + local = frame->local; - priv = this->private; + GF_VALIDATE_OR_GOTO("quota", local, unwind); - if (local->op_ret == -1) { - op_errno = local->op_errno; - goto unwind; + if (local->op_ret == -1) { + op_errno = local->op_errno; + if (op_errno == ENOENT || op_errno == ESTALE) { + /* We may get ENOENT/ESTALE in case of below scenario + * fd = open file.txt + * unlink file.txt + * fallocate on fd + * Here build_ancestry can fail as the file is removed. 
+ * For now ignore ENOENT/ESTALE on active fd + * We need to re-visit this code once we understand + * how other file-system behave in this scenario + */ + gf_msg_debug(this->name, 0, + "quota enforcer failed " + "with ENOENT/ESTALE on %s, cannot check " + "quota limits and allowing fallocate", + uuid_utoa(fd->inode->gfid)); + } else { + goto unwind; } + } - STACK_WIND (frame, priv->is_quota_on? - quota_fallocate_cbk: default_fallocate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, - xdata); - return 0; + STACK_WIND(frame, quota_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; unwind: - QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int32_t quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) + off_t offset, size_t len, dict_t *xdata) { - int32_t ret = -1, op_errno = EINVAL; - int32_t parents = 0; - quota_local_t *local = NULL; - quota_inode_ctx_t *ctx = NULL; - quota_priv_t *priv = NULL; - quota_dentry_t *dentry = NULL; - call_stub_t *stub = NULL; - - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, unwind); - - WIND_IF_QUOTAOFF (priv->is_quota_on, wind); - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("quota", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - local = quota_local_new (); - if (local == NULL) { - goto unwind; - } - - frame->local = local; - local->loc.inode = inode_ref (fd->inode); - - ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0); - if (ctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "quota context is " - "NULL on inode (%s). 
" - "If quota is not enabled recently and crawler " - "has finished crawling, its an error", - uuid_utoa (local->loc.inode->gfid)); - } - - stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode, - offset, len, xdata); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; + int32_t op_errno = EINVAL; + int32_t parents = 0; + int32_t fail_count = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_priv_t *priv = NULL; + quota_dentry_t *dentry = NULL; + quota_dentry_t *tmp = NULL; + call_stub_t *stub = NULL; + struct list_head head = { + 0, + }; + inode_t *par_inode = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, unwind); + + WIND_IF_QUOTAOFF(priv->is_quota_on, off); + + INIT_LIST_HEAD(&head); + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO("quota", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + local = quota_local_new(); + if (local == NULL) { + goto unwind; + } + + frame->local = local; + local->loc.inode = inode_ref(fd->inode); + + (void)quota_inode_ctx_get(fd->inode, this, &ctx, 0); + if (ctx == NULL) { + gf_msg_debug(this->name, 0, + "quota context is NULL on inode" + " (%s). If quota is not enabled recently and " + "crawler has finished crawling, its an error", + uuid_utoa(local->loc.inode->gfid)); + } + + stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode, offset, + len, xdata); + if (stub == NULL) { + op_errno = ENOMEM; + goto unwind; + } + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, unwind); + + parents = quota_add_parents_from_ctx(ctx, &head); + if (parents == -1) { + op_errno = errno; + goto unwind; + } + + /* + * Note that by using len as the delta we're assuming the range from + * offset to offset+len has not already been allocated. This can result + * in ENOSPC errors attempting to allocate an already allocated range. 
+ */ + local->delta = len; + local->object_delta = 0; + local->stub = stub; + local->link_count = parents; + + if (parents == 0) { + local->link_count = 1; + quota_check_limit(frame, fd->inode, this); + } else { + list_for_each_entry_safe(dentry, tmp, &head, next) + { + par_inode = do_quota_check_limit(frame, fd->inode, this, dentry, + _gf_false); + if (par_inode == NULL) { + /* remove stale entry from inode_ctx */ + quota_dentry_del(ctx, dentry->name, dentry->par); + parents--; + fail_count++; + } else { + inode_unref(par_inode); + } + __quota_dentry_free(dentry); } - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, unwind); - - if (ctx != NULL) { - LOCK (&ctx->lock); - { - list_for_each_entry (dentry, &ctx->parents, next) { - parents++; - } - } - UNLOCK (&ctx->lock); + if (parents == 0) { + LOCK(&local->lock); + { + local->link_count++; + } + UNLOCK(&local->lock); + quota_check_limit(frame, fd->inode, this); } - /* - * Note that by using len as the delta we're assuming the range from - * offset to offset+len has not already been allocated. This can result - * in ENOSPC errors attempting to allocate an already allocated range. - */ - local->delta = len; - local->stub = stub; - local->link_count = parents; - - if (parents == 0) { - local->link_count = 1; - quota_check_limit (frame, fd->inode, this, NULL, NULL); - } else { - list_for_each_entry (dentry, &ctx->parents, next) { - quota_check_limit (frame, fd->inode, this, dentry->name, - dentry->par); - } + while (fail_count != 0) { + quota_link_count_decrement(frame); + fail_count--; } + } - return 0; + return 0; unwind: - QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + QUOTA_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; -wind: - STACK_WIND (frame, priv->is_quota_on? 
- quota_fallocate_cbk: default_fallocate_cbk, - FIRST_CHILD(this), +off: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, - xdata); - return 0; + xdata); + return 0; } -/* Logs if -* i. Usage crossed soft limit -* ii. Usage above soft limit and alert-time timed out -*/ void -quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, - int64_t delta) +quota_log_helper(char **usage_str, int64_t cur_size, inode_t *inode, + char **path, time_t *cur_time) { - struct timeval cur_time = {0,}; - char *usage_str = NULL; - char *path = NULL; - int64_t cur_size = 0; - quota_priv_t *priv = NULL; - - priv = this->private; - cur_size = ctx->size + delta; - if ((ctx->soft_lim <= 0) || (timerisset (&ctx->prev_log) && - !quota_timeout (&ctx->prev_log, - priv->log_timeout))) { - return; - } + xlator_t *this = THIS; - gettimeofday (&cur_time, NULL); - ctx->prev_log = cur_time; + if (!usage_str || !inode || !path || !cur_time) { + gf_log(this->name, GF_LOG_ERROR, "Received null argument"); + return; + } - usage_str = gf_uint64_2human_readable (cur_size); - inode_path (inode, NULL, &path); - if (!path) - path = uuid_utoa (inode->gfid); + *usage_str = gf_uint64_2human_readable(cur_size); + if (!(*usage_str)) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, + "integer to string conversion failed Reason" + ":\"Cannot allocate memory\""); - /* Usage crossed/reached soft limit */ - if (DID_REACH_LIMIT (ctx->soft_lim, ctx->size, cur_size)) { + inode_path(inode, NULL, path); + if (!(*path)) + *path = uuid_utoa(inode->gfid); - gf_log (this->name, GF_LOG_ALERT, "Usage crossed " - "soft limit: %s used by %s", usage_str, path); - } - /* Usage is above soft limit */ - else if (cur_size > ctx->soft_lim){ - gf_log (this->name, GF_LOG_ALERT, "Usage is above " - "soft limit: %s used by %s", usage_str, path); - } - if (usage_str) - GF_FREE (usage_str); + *cur_time = gf_time(); } -int32_t -mem_acct_init (xlator_t *this) +/* Logs 
if + * i. Usage crossed soft limit + * ii. Usage above soft limit and alert-time elapsed + */ +void +quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, + int64_t delta) { - int ret = -1; + time_t cur_time = 0; + char *usage_str = NULL; + char *path = NULL; + int64_t cur_size = 0; + quota_priv_t *priv = NULL; - if (!this) - return ret; + priv = this->private; + cur_size = ctx->size + delta; - ret = xlator_mem_acct_init (this, gf_quota_mt_end + 1); + if ((ctx->soft_lim <= 0) || cur_size < ctx->soft_lim) + return; - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, "Memory accounting" - "init failed"); - return ret; - } + /* Usage crossed/reached soft limit */ + if (DID_REACH_LIMIT(ctx->soft_lim, ctx->size, cur_size)) { + quota_log_helper(&usage_str, cur_size, inode, &path, &cur_time); - return ret; -} + gf_msg(this->name, GF_LOG_ALERT, 0, Q_MSG_CROSSED_SOFT_LIMIT, + "Usage crossed soft limit: " + "%s used by %s", + usage_str, path); + gf_event(EVENT_QUOTA_CROSSED_SOFT_LIMIT, + "Usage=%s;volume=%s;" + "path=%s", + usage_str, priv->volume_uuid, path); -int32_t -quota_forget (xlator_t *this, inode_t *inode) -{ - int32_t ret = 0; - uint64_t ctx_int = 0; - quota_inode_ctx_t *ctx = NULL; - quota_dentry_t *dentry = NULL, *tmp; + ctx->prev_log_time = cur_time; - ret = inode_ctx_del (inode, this, &ctx_int); + } + /* Usage is above soft limit */ + else if (cur_size > ctx->soft_lim && + quota_timeout(ctx->prev_log_time, priv->log_timeout)) { + quota_log_helper(&usage_str, cur_size, inode, &path, &cur_time); - if (ret < 0) { - return 0; - } - - ctx = (quota_inode_ctx_t *) (long)ctx_int; + gf_msg(this->name, GF_LOG_ALERT, 0, Q_MSG_CROSSED_SOFT_LIMIT, + "Usage is above soft limit: %s used by %s", usage_str, path); - LOCK (&ctx->lock); - { - list_for_each_entry_safe (dentry, tmp, &ctx->parents, next) { - __quota_dentry_free (dentry); - } - } - UNLOCK (&ctx->lock); + gf_event(EVENT_QUOTA_CROSSED_SOFT_LIMIT, + "Usage=%s;volume=%s;" + "path=%s", + usage_str, 
priv->volume_uuid, path); - LOCK_DESTROY (&ctx->lock); + ctx->prev_log_time = cur_time; + } - GF_FREE (ctx); + if (path) + GF_FREE(path); - return 0; + if (usage_str) + GF_FREE(usage_str); } int32_t -init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int32_t ret = -1; - quota_priv_t *priv = NULL; - - if ((this->children == NULL) - || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: quota (%s) not configured with " - "exactly one child", this->name); - return -1; - } + int ret = -1; - if (this->parents == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } + if (!this) + return ret; - QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err); + ret = xlator_mem_acct_init(this, gf_quota_mt_end + 1); - LOCK_INIT (&priv->lock); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "Memory accounting init failed"); + return ret; + } - this->private = priv; + return ret; +} - GF_OPTION_INIT ("deem-statfs", priv->consider_statfs, bool, err); - GF_OPTION_INIT ("server-quota", priv->is_quota_on, bool, err); - GF_OPTION_INIT ("default-soft-limit", priv->default_soft_lim, percent, - err); - GF_OPTION_INIT ("soft-timeout", priv->soft_timeout, time, err); - GF_OPTION_INIT ("hard-timeout", priv->hard_timeout, time, err); - GF_OPTION_INIT ("alert-time", priv->log_timeout, time, err); - GF_OPTION_INIT ("volume-uuid", priv->volume_uuid, str, err); +int32_t +quota_forget(xlator_t *this, inode_t *inode) +{ + int32_t ret = 0; + uint64_t ctx_int = 0; + quota_inode_ctx_t *ctx = NULL; + quota_dentry_t *dentry = NULL, *tmp; - this->local_pool = mem_pool_new (quota_local_t, 64); - if (!this->local_pool) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto err; - } + ret = inode_ctx_del(inode, this, &ctx_int); - if (priv->is_quota_on) { - priv->rpc_clnt = quota_enforcer_init (this, this->options); - if (priv->rpc_clnt == NULL) { - ret = -1; - gf_log (this->name, 
GF_LOG_WARNING, - "quota enforcer rpc init failed"); - goto err; - } + if (ret < 0) { + return 0; + } + + ctx = (quota_inode_ctx_t *)(long)ctx_int; + + LOCK(&ctx->lock); + { + list_for_each_entry_safe(dentry, tmp, &ctx->parents, next) + { + __quota_dentry_free(dentry); } + } + UNLOCK(&ctx->lock); - ret = 0; -err: - return ret; + LOCK_DESTROY(&ctx->lock); + + GF_FREE(ctx); + + return 0; } int -reconfigure (xlator_t *this, dict_t *options) +notify(xlator_t *this, int event, void *data, ...) { - int32_t ret = -1; - quota_priv_t *priv = NULL; - - priv = this->private; - - GF_OPTION_RECONF ("deem-statfs", priv->consider_statfs, options, bool, - out); - GF_OPTION_RECONF ("server-quota", priv->is_quota_on, options, bool, - out); - GF_OPTION_RECONF ("default-soft-limit", priv->default_soft_lim, - options, percent, out); - GF_OPTION_RECONF ("alert-time", priv->log_timeout, options, - time, out); - GF_OPTION_RECONF ("soft-timeout", priv->soft_timeout, options, - time, out); - GF_OPTION_RECONF ("hard-timeout", priv->hard_timeout, options, - time, out); - - if (priv->is_quota_on) { - priv->rpc_clnt = quota_enforcer_init (this, - this->options); - if (priv->rpc_clnt == NULL) { - ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "quota enforcer rpc init failed"); - goto out; - } - - } else { - if (priv->rpc_clnt) { - // Quotad is shutdown when there is no started volume - // which has quota enabled. So, we should disable the - // enforcer client when quota is disabled on a volume, - // to avoid spurious reconnect attempts to a service - // (quotad), that is known to be down. 
- rpc_clnt_disable (priv->rpc_clnt); + quota_priv_t *priv = NULL; + int ret = 0; + rpc_clnt_t *rpc = NULL; + gf_boolean_t conn_status = _gf_true; + xlator_t *victim = data; + + priv = this->private; + if (!priv || !priv->is_quota_on) + goto out; + + if (event == GF_EVENT_PARENT_DOWN) { + rpc = priv->rpc_clnt; + if (rpc) { + rpc_clnt_disable(rpc); + pthread_mutex_lock(&priv->conn_mutex); + { + conn_status = priv->conn_status; + while (conn_status) { + (void)pthread_cond_wait(&priv->conn_cond, + &priv->conn_mutex); + conn_status = priv->conn_status; } + } + pthread_mutex_unlock(&priv->conn_mutex); + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name); } + } - ret = 0; out: - return ret; + ret = default_notify(this, event, data); + return ret; } int32_t -quota_priv_dump (xlator_t *this) +init(xlator_t *this) { - quota_priv_t *priv = NULL; - int32_t ret = -1; + int32_t ret = -1; + quota_priv_t *priv = NULL; + rpc_clnt_t *rpc = NULL; + + if ((this->children == NULL) || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_INVALID_VOLFILE, + "FATAL: quota (%s) not configured with " + "exactly one child", + this->name); + return -1; + } + + if (this->parents == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INVALID_VOLFILE, + "dangling volume. 
check volfile"); + } + + QUOTA_ALLOC_OR_GOTO(priv, quota_priv_t, err); + + LOCK_INIT(&priv->lock); + + this->private = priv; + + GF_OPTION_INIT("deem-statfs", priv->consider_statfs, bool, err); + GF_OPTION_INIT("server-quota", priv->is_quota_on, bool, err); + GF_OPTION_INIT("default-soft-limit", priv->default_soft_lim, percent, err); + GF_OPTION_INIT("soft-timeout", priv->soft_timeout, time, err); + GF_OPTION_INIT("hard-timeout", priv->hard_timeout, time, err); + GF_OPTION_INIT("alert-time", priv->log_timeout, time, err); + GF_OPTION_INIT("volume-uuid", priv->volume_uuid, str, err); + + this->local_pool = mem_pool_new(quota_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, + "failed to create local_t's memory pool"); + goto err; + } + + pthread_mutex_init(&priv->conn_mutex, NULL); + pthread_cond_init(&priv->conn_cond, NULL); + priv->conn_status = _gf_false; + + if (priv->is_quota_on) { + rpc = quota_enforcer_init(this, this->options); + if (rpc == NULL) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, 0, + Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED, + "quota enforcer rpc init failed"); + goto err; + } + + LOCK(&priv->lock); + { + priv->rpc_clnt = rpc; + } + UNLOCK(&priv->lock); + } + + ret = 0; +err: + return ret; +} +int +reconfigure(xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + quota_priv_t *priv = NULL; + gf_boolean_t quota_on = _gf_false; + rpc_clnt_t *rpc = NULL; + + priv = this->private; + + GF_OPTION_RECONF("deem-statfs", priv->consider_statfs, options, bool, out); + GF_OPTION_RECONF("server-quota", quota_on, options, bool, out); + GF_OPTION_RECONF("default-soft-limit", priv->default_soft_lim, options, + percent, out); + GF_OPTION_RECONF("alert-time", priv->log_timeout, options, time, out); + GF_OPTION_RECONF("soft-timeout", priv->soft_timeout, options, time, out); + GF_OPTION_RECONF("hard-timeout", priv->hard_timeout, options, time, out); + + if (quota_on) { + priv->rpc_clnt = 
quota_enforcer_init(this, this->options); + if (priv->rpc_clnt == NULL) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, 0, + Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED, + "quota enforcer rpc init failed"); + goto out; + } + + } else { + LOCK(&priv->lock); + { + rpc = priv->rpc_clnt; + priv->rpc_clnt = NULL; + } + UNLOCK(&priv->lock); - GF_ASSERT (this); + if (rpc != NULL) { + // Quotad is shutdown when there is no started volume + // which has quota enabled. So, we should disable the + // enforcer client when quota is disabled on a volume, + // to avoid spurious reconnect attempts to a service + // (quotad), that is known to be down. + rpc_clnt_unref(rpc); + } + } - priv = this->private; + priv->is_quota_on = quota_on; - gf_proc_dump_add_section ("xlators.features.quota.priv", this->name); + ret = 0; +out: + return ret; +} - ret = TRY_LOCK (&priv->lock); - if (ret) - goto out; - else { - gf_proc_dump_write("soft-timeout", "%d", priv->soft_timeout); - gf_proc_dump_write("hard-timeout", "%d", priv->hard_timeout); - gf_proc_dump_write("alert-time", "%d", priv->log_timeout); - gf_proc_dump_write("quota-on", "%d", priv->is_quota_on); - gf_proc_dump_write("statfs", "%d", priv->consider_statfs); - gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid); - gf_proc_dump_write("validation-count", "%ld", - priv->validation_count); - } - UNLOCK (&priv->lock); +int32_t +quota_priv_dump(xlator_t *this) +{ + quota_priv_t *priv = NULL; + int32_t ret = -1; + + GF_ASSERT(this); + + priv = this->private; + if (!priv) + goto out; + + gf_proc_dump_add_section("xlators.features.quota.priv"); + + ret = TRY_LOCK(&priv->lock); + if (ret) + goto out; + else { + gf_proc_dump_write("soft-timeout", "%u", priv->soft_timeout); + gf_proc_dump_write("hard-timeout", "%u", priv->hard_timeout); + gf_proc_dump_write("alert-time", "%u", priv->log_timeout); + gf_proc_dump_write("quota-on", "%d", priv->is_quota_on); + gf_proc_dump_write("statfs", "%d", priv->consider_statfs); + 
gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid); + gf_proc_dump_write("validation-count", "%" PRIu64, + priv->validation_count); + } + UNLOCK(&priv->lock); out: - return 0; + return 0; } void -fini (xlator_t *this) +fini(xlator_t *this) { + quota_priv_t *priv = NULL; + rpc_clnt_t *rpc = NULL; + + priv = this->private; + if (!priv) return; + rpc = priv->rpc_clnt; + priv->rpc_clnt = NULL; + if (rpc) { + rpc_clnt_connection_cleanup(&rpc->conn); + rpc_clnt_unref(rpc); + } + + this->private = NULL; + LOCK_DESTROY(&priv->lock); + pthread_mutex_destroy(&priv->conn_mutex); + pthread_cond_destroy(&priv->conn_cond); + + GF_FREE(priv); + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + return; } - struct xlator_fops fops = { - .statfs = quota_statfs, - .lookup = quota_lookup, - .writev = quota_writev, - .create = quota_create, - .mkdir = quota_mkdir, - .truncate = quota_truncate, - .ftruncate = quota_ftruncate, - .unlink = quota_unlink, - .symlink = quota_symlink, - .link = quota_link, - .rename = quota_rename, - .getxattr = quota_getxattr, - .fgetxattr = quota_fgetxattr, - .stat = quota_stat, - .fstat = quota_fstat, - .readlink = quota_readlink, - .readv = quota_readv, - .fsync = quota_fsync, - .setattr = quota_setattr, - .fsetattr = quota_fsetattr, - .mknod = quota_mknod, - .setxattr = quota_setxattr, - .fsetxattr = quota_fsetxattr, - .removexattr = quota_removexattr, - .fremovexattr = quota_fremovexattr, - .readdirp = quota_readdirp, - .fallocate = quota_fallocate, + .statfs = quota_statfs, + .lookup = quota_lookup, + .writev = quota_writev, + .create = quota_create, + .mkdir = quota_mkdir, + .truncate = quota_truncate, + .ftruncate = quota_ftruncate, + .unlink = quota_unlink, + .symlink = quota_symlink, + .link = quota_link, + .rename = quota_rename, + .getxattr = quota_getxattr, + .fgetxattr = quota_fgetxattr, + .stat = quota_stat, + .fstat = quota_fstat, + .readlink = quota_readlink, + .readv = quota_readv, + 
.fsync = quota_fsync, + .setattr = quota_setattr, + .fsetattr = quota_fsetattr, + .mknod = quota_mknod, + .setxattr = quota_setxattr, + .fsetxattr = quota_fsetxattr, + .removexattr = quota_removexattr, + .fremovexattr = quota_fremovexattr, + .readdirp = quota_readdirp, + .fallocate = quota_fallocate, }; -struct xlator_cbks cbks = { - .forget = quota_forget -}; +struct xlator_cbks cbks = {.forget = quota_forget}; struct xlator_dumpops dumpops = { - .priv = quota_priv_dump, + .priv = quota_priv_dump, }; struct volume_options options[] = { - {.key = {"limit-set"}}, - {.key = {"deem-statfs"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "If set to on, it takes quota limits into" - "consideration while estimating fs size. (df command)" - " (Default is off)." - }, - {.key = {"server-quota"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "Skip the quota enforcement if the feature is" - " not turned on. This is not a user exposed option." - }, - {.key = {"default-soft-limit"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "80%", - .min = 0, - .max = LONG_MAX, - }, - {.key = {"soft-timeout"}, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 1800, - .default_value = "60", - .description = "quota caches the directory sizes on client. " - "soft-timeout indicates the timeout for the validity of" - " cache before soft-limit has been crossed." - }, - {.key = {"hard-timeout"}, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 60, - .default_value = "5", - .description = "quota caches the directory sizes on client. " - "hard-timeout indicates the timeout for the validity of" - " cache after soft-limit has been crossed." 
- }, - { .key = {"username"}, - .type = GF_OPTION_TYPE_ANY, - }, - { .key = {"password"}, - .type = GF_OPTION_TYPE_ANY, - }, - { .key = {"transport-type"}, - .value = {"tcp", "socket", "ib-verbs", "unix", "ib-sdp", - "tcp/client", "ib-verbs/client", "rdma"}, - .type = GF_OPTION_TYPE_STR, - }, - { .key = {"remote-host"}, - .type = GF_OPTION_TYPE_INTERNET_ADDRESS, - }, - { .key = {"remote-port"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"volume-uuid"}, - .type = GF_OPTION_TYPE_STR, - .description = "uuid of the volume this brick is part of." - }, - { .key = {"alert-time"}, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 7*86400, - .default_value = "86400", - }, - {.key = {NULL}} + { + .key = {"enable"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable is the volume option that can be used " + "to turn on quota.", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .level = OPT_STATUS_BASIC, + .tags = {}, + }, + { + .key = {"deem-statfs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "If set to on, it takes quota limits into" + " consideration while estimating fs size. (df command)" + " (Default is on).", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {}, + }, + { + .key = {"server-quota"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Skip the quota enforcement if the feature is" + " not turned on. This is not a user exposed option.", + .flags = OPT_FLAG_NONE, + }, + { + .key = {"default-soft-limit"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "80%", + .op_version = {3}, + .description = "Soft limit is expressed as a proportion of hard limit." 
+ " Default-soft-limit is the proportion used when the " + " user does not supply any soft limit value.", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {}, + }, + { + .key = {"soft-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .max = 1800, + .default_value = "60", + .description = "quota caches the directory sizes on client. " + "soft-timeout indicates the timeout for the validity of" + " cache before soft-limit has been crossed.", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {}, + }, + { + .key = {"hard-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .max = 60, + .default_value = "5", + .description = "quota caches the directory sizes on client. " + "hard-timeout indicates the timeout for the validity of" + " cache after soft-limit has been crossed.", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {}, + }, + {.key = {"volume-uuid"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "{{ volume.id }}", + .description = "uuid of the volume this brick is part of."}, + { + .key = {"alert-time"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .max = 7 * 86400, + .default_value = "86400", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "Frequency of limit breach messages in log.", + .tags = {}, + }, + {.key = {NULL}}}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "quota", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/quota/src/quota.h b/xlators/features/quota/src/quota.h index 96c19e77eb8..0395d78c9ef 100644 --- a/xlators/features/quota/src/quota.h +++ b/xlators/features/quota/src/quota.h @@ -7,205 +7,260 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free 
Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #ifndef _QUOTA_H #define _QUOTA_H -#include "xlator.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" +#include <glusterfs/call-stub.h> #include "quota-mem-types.h" -#include "glusterfs.h" -#include "compat.h" -#include "logging.h" -#include "dict.h" -#include "stack.h" -#include "common-utils.h" -#include "event.h" -#include "globals.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/gf-event.h> #include "rpcsvc.h" #include "rpc-clnt.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> #include "glusterfs3-xdr.h" #include "glusterfs3.h" #include "xdr-generic.h" -#include "compat-errno.h" +#include <glusterfs/compat-errno.h> #include "protocol-common.h" +#include <glusterfs/quota-common-utils.h> +#include "quota-messages.h" -#define DIRTY "dirty" -#define SIZE "size" -#define CONTRIBUTION "contri" -#define VAL_LENGTH 8 -#define READDIR_BUF 4096 +#define DIRTY "dirty" +#define SIZE "size" +#define CONTRIBUTION "contri" +#define VAL_LENGTH 8 +#define READDIR_BUF 4096 #ifndef UUID_CANONICAL_FORM_LEN #define UUID_CANONICAL_FORM_LEN 36 #endif -#define WIND_IF_QUOTAOFF(is_quota_on, label) \ - if (!is_quota_on) \ - goto label; - -#define DID_REACH_LIMIT(lim, prev_size, cur_size) \ - ((cur_size) >= (lim) && (prev_size) < (lim)) - -#define QUOTA_SAFE_INCREMENT(lock, var) \ - do { \ - LOCK (lock); \ - var ++; \ - UNLOCK (lock); \ - } while (0) - -#define QUOTA_SAFE_DECREMENT(lock, var) \ - do { \ - LOCK (lock); \ - var --; \ - UNLOCK (lock); \ - } while (0) - -#define QUOTA_ALLOC_OR_GOTO(var, type, label) \ - do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_quota_mt_##type); \ - if (!var) { \ - gf_log ("", GF_LOG_ERROR, \ - "out of memory"); \ - ret = -1; \ - goto label; \ - } \ - } while (0); - -#define QUOTA_STACK_UNWIND(fop, frame, params...) 
\ - do { \ - quota_local_t *_local = NULL; \ - xlator_t *_this = NULL; \ - if (frame) { \ - _local = frame->local; \ - _this = frame->this; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - quota_local_cleanup (_this, _local); \ - } while (0) - -#define QUOTA_FREE_CONTRIBUTION_NODE(_contribution) \ - do { \ - list_del (&_contribution->contri_list); \ - GF_FREE (_contribution); \ - } while (0) - -#define GET_CONTRI_KEY(var, _vol_name, _gfid, _ret) \ - do { \ - char _gfid_unparsed[40]; \ - if (_gfid != NULL) { \ - uuid_unparse (_gfid, _gfid_unparsed); \ - _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \ - "%s.%s." CONTRIBUTION, \ - _vol_name, _gfid_unparsed); \ - } else { \ - _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \ - "%s.." CONTRIBUTION, \ - _vol_name); \ - } \ - } while (0) - - -#define GET_CONTRI_KEY_OR_GOTO(var, _vol_name, _gfid, label) \ - do { \ - GET_CONTRI_KEY(var, _vol_name, _gfid, ret); \ - if (ret == -1) \ - goto label; \ - } while (0) - -#define GET_DIRTY_KEY_OR_GOTO(var, _vol_name, label) \ - do { \ - ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \ - "%s." 
DIRTY, _vol_name); \ - if (ret == -1) \ - goto label; \ - } while (0) +#define WIND_IF_QUOTAOFF(is_quota_on, label) \ + if (!is_quota_on) \ + goto label; + +#define QUOTA_WIND_FOR_INTERNAL_FOP(xdata, label) \ + do { \ + if (xdata && dict_get_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) \ + goto label; \ + } while (0) + +#define DID_REACH_LIMIT(lim, prev_size, cur_size) \ + ((cur_size) >= (lim) && (prev_size) < (lim)) + +#define QUOTA_SAFE_INCREMENT(lock, var) \ + do { \ + LOCK(lock); \ + var++; \ + UNLOCK(lock); \ + } while (0) + +#define QUOTA_SAFE_DECREMENT(lock, var) \ + do { \ + LOCK(lock); \ + var--; \ + UNLOCK(lock); \ + } while (0) + +#define QUOTA_ALLOC_OR_GOTO(var, type, label) \ + do { \ + var = GF_CALLOC(sizeof(type), 1, gf_quota_mt_##type); \ + if (!var) { \ + gf_msg("", GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, "out of memory"); \ + ret = -1; \ + goto label; \ + } \ + } while (0); + +#define QUOTA_STACK_WIND_TAIL(frame, params...) \ + do { \ + quota_local_t *_local = NULL; \ + \ + if (frame) { \ + _local = frame->local; \ + frame->local = NULL; \ + } \ + \ + STACK_WIND_TAIL(frame, params); \ + \ + if (_local) \ + quota_local_cleanup(_local); \ + } while (0) + +#define QUOTA_STACK_UNWIND(fop, frame, params...) \ + do { \ + quota_local_t *_local = NULL; \ + if (frame) { \ + _local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + quota_local_cleanup(_local); \ + } while (0) + +#define QUOTA_FREE_CONTRIBUTION_NODE(_contribution) \ + do { \ + list_del(&_contribution->contri_list); \ + GF_FREE(_contribution); \ + } while (0) + +#define GET_CONTRI_KEY(var, _vol_name, _gfid, _ret) \ + do { \ + char _gfid_unparsed[40]; \ + if (_gfid != NULL) { \ + gf_uuid_unparse(_gfid, _gfid_unparsed); \ + _ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s.%s." CONTRIBUTION, \ + _vol_name, _gfid_unparsed); \ + } else { \ + _ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s.." 
CONTRIBUTION, \ + _vol_name); \ + } \ + } while (0) + +#define GET_CONTRI_KEY_OR_GOTO(var, _vol_name, _gfid, label) \ + do { \ + GET_CONTRI_KEY(var, _vol_name, _gfid, ret); \ + if (ret == -1) \ + goto label; \ + } while (0) +#define GET_DIRTY_KEY_OR_GOTO(var, _vol_name, label) \ + do { \ + ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s." DIRTY, _vol_name); \ + if (ret == -1) \ + goto label; \ + } while (0) +#define QUOTA_REG_OR_LNK_FILE(ia_type) (IA_ISREG(ia_type) || IA_ISLNK(ia_type)) struct quota_dentry { - char *name; - uuid_t par; - struct list_head next; + char *name; + uuid_t par; + struct list_head next; }; typedef struct quota_dentry quota_dentry_t; struct quota_inode_ctx { - int64_t size; - int64_t hard_lim; - int64_t soft_lim; - struct iatt buf; - struct list_head parents; - struct timeval tv; - struct timeval prev_log; - gf_lock_t lock; + int64_t size; + int64_t hard_lim; + int64_t soft_lim; + int64_t file_count; + int64_t dir_count; + int64_t object_hard_lim; + int64_t object_soft_lim; + struct iatt buf; + struct list_head parents; + time_t validate_time; + time_t prev_log_time; + gf_boolean_t ancestry_built; + gf_lock_t lock; }; typedef struct quota_inode_ctx quota_inode_ctx_t; -struct quota_limit { - int64_t hard_lim; - int64_t soft_lim_percent; -} __attribute__ ((packed)); -typedef struct quota_limit quota_limit_t; +typedef void (*quota_ancestry_built_t)(struct list_head *parents, + inode_t *inode, int32_t op_ret, + int32_t op_errno, void *data); + +typedef void (*quota_fop_continue_t)(call_frame_t *frame); struct quota_local { - gf_lock_t lock; - uint32_t validate_count; - uint32_t link_count; - loc_t loc; - loc_t oldloc; - loc_t newloc; - loc_t validate_loc; - int64_t delta; - int32_t op_ret; - int32_t op_errno; - int64_t size; - gf_boolean_t skip_check; - char just_validated; - fop_lookup_cbk_t validate_cbk; - inode_t *inode; - call_stub_t *stub; - struct iobref *iobref; - quota_limit_t limit; - int64_t space_available; + gf_lock_t lock; + uint32_t 
link_count; + loc_t loc; + loc_t oldloc; + loc_t newloc; + loc_t validate_loc; + int64_t delta; + int8_t object_delta; + int32_t op_ret; + int32_t op_errno; + int64_t size; + char just_validated; + fop_lookup_cbk_t validate_cbk; + quota_fop_continue_t fop_continue_cbk; + inode_t *inode; + uuid_t common_ancestor; /* Used by quota_rename */ + call_stub_t *stub; + struct iobref *iobref; + quota_limits_t limit; + quota_limits_t object_limit; + int64_t space_available; + quota_ancestry_built_t ancestry_cbk; + void *ancestry_data; + dict_t *xdata; + dict_t *validate_xdata; + int32_t quotad_conn_retry; + xlator_t *this; + call_frame_t *par_frame; }; -typedef struct quota_local quota_local_t; +typedef struct quota_local quota_local_t; struct quota_priv { - uint32_t soft_timeout; - uint32_t hard_timeout; - uint32_t log_timeout; - double default_soft_lim; - gf_boolean_t is_quota_on; - gf_boolean_t consider_statfs; - gf_lock_t lock; - rpc_clnt_prog_t *quota_enforcer; - struct rpcsvc_program *quotad_aggregator; - struct rpc_clnt *rpc_clnt; - rpcsvc_t *rpcsvc; - inode_table_t *itable; - char *volume_uuid; - uint64_t validation_count; + /* FIXME: consider time_t for timeouts. 
*/ + uint32_t soft_timeout; + uint32_t hard_timeout; + uint32_t log_timeout; + double default_soft_lim; + gf_boolean_t is_quota_on; + gf_boolean_t consider_statfs; + gf_lock_t lock; + rpc_clnt_prog_t *quota_enforcer; + struct rpcsvc_program *quotad_aggregator; + struct rpc_clnt *rpc_clnt; + rpcsvc_t *rpcsvc; + inode_table_t *itable; + char *volume_uuid; + uint64_t validation_count; + int32_t quotad_conn_status; + pthread_mutex_t conn_mutex; + pthread_cond_t conn_cond; + gf_boolean_t conn_status; }; -typedef struct quota_priv quota_priv_t; +typedef struct quota_priv quota_priv_t; int -quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xdata, fop_lookup_cbk_t cbk); +quota_enforcer_lookup(call_frame_t *frame, xlator_t *this, dict_t *xdata, + fop_lookup_cbk_t cbk); + +void +_quota_enforcer_lookup(void *data); + struct rpc_clnt * -quota_enforcer_init (xlator_t *this, dict_t *options); +quota_enforcer_init(xlator_t *this, dict_t *options); + +void +quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, + int64_t delta); + +int +quota_build_ancestry(inode_t *inode, quota_ancestry_built_t ancestry_cbk, + void *data); void -quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode, - int64_t delta); +quota_get_limit_dir(call_frame_t *frame, inode_t *cur_inode, xlator_t *this); + +int32_t +quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this); + +inode_t * +do_quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this, + quota_dentry_t *dentry, gf_boolean_t force); +int +quota_fill_inodectx(xlator_t *this, inode_t *inode, dict_t *dict, loc_t *loc, + struct iatt *buf, int32_t *op_errno); + +int32_t +quota_check_size_limit(call_frame_t *frame, quota_inode_ctx_t *ctx, + quota_priv_t *priv, inode_t *_inode, xlator_t *this, + int32_t *op_errno, int just_validated, int64_t delta, + quota_local_t *local, gf_boolean_t *skip_check); +int32_t +quota_check_object_limit(call_frame_t *frame, 
quota_inode_ctx_t *ctx, + quota_priv_t *priv, inode_t *_inode, xlator_t *this, + int32_t *op_errno, int just_validated, + quota_local_t *local, gf_boolean_t *skip_check); #endif diff --git a/xlators/features/quota/src/quotad-aggregator.c b/xlators/features/quota/src/quotad-aggregator.c index f3f65ca2a04..75d47867b5b 100644 --- a/xlators/features/quota/src/quotad-aggregator.c +++ b/xlators/features/quota/src/quotad-aggregator.c @@ -13,411 +13,482 @@ #include "quotad-helpers.h" #include "quotad-aggregator.h" -struct rpcsvc_program quotad_aggregator_prog; +static char *qd_ext_xattrs[] = { + QUOTA_SIZE_KEY, + QUOTA_LIMIT_KEY, + QUOTA_LIMIT_OBJECTS_KEY, + NULL, +}; + +static struct rpcsvc_program quotad_aggregator_prog; struct iobuf * -quotad_serialize_reply (rpcsvc_request_t *req, void *arg, struct iovec *outmsg, - xdrproc_t xdrproc) +quotad_serialize_reply(rpcsvc_request_t *req, void *arg, struct iovec *outmsg, + xdrproc_t xdrproc) { - struct iobuf *iob = NULL; - ssize_t retlen = 0; - ssize_t xdr_size = 0; - - GF_VALIDATE_OR_GOTO ("server", req, ret); - - /* First, get the io buffer into which the reply in arg will - * be serialized. + struct iobuf *iob = NULL; + ssize_t retlen = 0; + ssize_t xdr_size = 0; + + GF_VALIDATE_OR_GOTO("server", req, ret); + + /* First, get the io buffer into which the reply in arg will + * be serialized. + */ + if (arg && xdrproc) { + xdr_size = xdr_sizeof(xdrproc, arg); + iob = iobuf_get2(req->svc->ctx->iobuf_pool, xdr_size); + if (!iob) { + gf_log_callingfn(THIS->name, GF_LOG_ERROR, "Failed to get iobuf"); + goto ret; + }; + + iobuf_to_iovec(iob, outmsg); + /* Use the given serializer to translate the given C structure + * in arg to XDR format which will be written into the buffer + * in outmsg. 
*/ - if (arg && xdrproc) { - xdr_size = xdr_sizeof (xdrproc, arg); - iob = iobuf_get2 (req->svc->ctx->iobuf_pool, xdr_size); - if (!iob) { - gf_log_callingfn (THIS->name, GF_LOG_ERROR, - "Failed to get iobuf"); - goto ret; - }; - - iobuf_to_iovec (iob, outmsg); - /* Use the given serializer to translate the give C structure in arg - * to XDR format which will be written into the buffer in outmsg. - */ - /* retlen is used to received the error since size_t is unsigned and we - * need -1 for error notification during encoding. - */ - - retlen = xdr_serialize_generic (*outmsg, arg, xdrproc); - if (retlen == -1) { - /* Failed to Encode 'GlusterFS' msg in RPC is not exactly - failure of RPC return values.. client should get - notified about this, so there are no missing frames */ - gf_log_callingfn ("", GF_LOG_ERROR, "Failed to encode message"); - req->rpc_err = GARBAGE_ARGS; - retlen = 0; - } - } - outmsg->iov_len = retlen; -ret: + /* retlen is used to received the error since size_t is unsigned and we + * need -1 for error notification during encoding. + */ + + retlen = xdr_serialize_generic(*outmsg, arg, xdrproc); if (retlen == -1) { - iobuf_unref (iob); - iob = NULL; + /* Failed to Encode 'GlusterFS' msg in RPC is not exactly + failure of RPC return values.. 
Client should get + notified about this, so there are no missing frames */ + gf_log_callingfn("", GF_LOG_ERROR, "Failed to encode message"); + req->rpc_err = GARBAGE_ARGS; + retlen = 0; } - - return iob; + } + outmsg->iov_len = retlen; +ret: + return iob; } int -quotad_aggregator_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, - void *arg, struct iovec *payload, - int payloadcount, struct iobref *iobref, - xdrproc_t xdrproc) +quotad_aggregator_submit_reply(call_frame_t *frame, rpcsvc_request_t *req, + void *arg, struct iovec *payload, + int payloadcount, struct iobref *iobref, + xdrproc_t xdrproc) { - struct iobuf *iob = NULL; - int ret = -1; - struct iovec rsp = {0,}; - quotad_aggregator_state_t *state = NULL; - char new_iobref = 0; + struct iobuf *iob = NULL; + int ret = -1; + struct iovec rsp = { + 0, + }; + quotad_aggregator_state_t *state = NULL; + char new_iobref = 0; - GF_VALIDATE_OR_GOTO ("server", req, ret); + GF_VALIDATE_OR_GOTO("server", req, ret); - if (frame) { - state = frame->root->state; - frame->local = NULL; - } + if (frame) { + state = frame->root->state; + frame->local = NULL; + } + if (!iobref) { + iobref = iobref_new(); if (!iobref) { - iobref = iobref_new (); - if (!iobref) { - goto ret; - } - - new_iobref = 1; + goto ret; } - iob = quotad_serialize_reply (req, arg, &rsp, xdrproc); - if (!iob) { - gf_log ("", GF_LOG_ERROR, "Failed to serialize reply"); - goto ret; - } + new_iobref = 1; + } + + iob = quotad_serialize_reply(req, arg, &rsp, xdrproc); + if (!iob) { + gf_msg("", GF_LOG_ERROR, 0, Q_MSG_DICT_SERIALIZE_FAIL, + "Failed to serialize reply"); + goto ret; + } - iobref_add (iobref, iob); + iobref_add(iobref, iob); - ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount, - iobref); + ret = rpcsvc_submit_generic(req, &rsp, 1, payload, payloadcount, iobref); - iobuf_unref (iob); + iobuf_unref(iob); - ret = 0; + ret = 0; ret: - if (state) { - quotad_aggregator_free_state (state); - } + if (state) { + 
quotad_aggregator_free_state(state); + } - if (frame) { - if (frame->root->client) - gf_client_unref (frame->root->client); + if (frame) + STACK_DESTROY(frame->root); - STACK_DESTROY (frame->root); - } + if (new_iobref) { + iobref_unref(iobref); + } - if (new_iobref) { - iobref_unref (iobref); - } - - return ret; + return ret; } int -quotad_aggregator_getlimit_cbk (xlator_t *this, call_frame_t *frame, - void *lookup_rsp) +quotad_aggregator_getlimit_cbk(xlator_t *this, call_frame_t *frame, + void *lookup_rsp) { - gfs3_lookup_rsp *rsp = lookup_rsp; - gf_cli_rsp cli_rsp = {0,}; - dict_t *xdata = NULL; - int ret = -1; + gfs3_lookup_rsp *rsp = lookup_rsp; + gf_cli_rsp cli_rsp = { + 0, + }; + dict_t *xdata = NULL; + quotad_aggregator_state_t *state = NULL; + int ret = -1; + int type = 0; + + if (!rsp || (rsp->op_ret == -1)) + goto reply; + + GF_PROTOCOL_DICT_UNSERIALIZE(frame->this, xdata, (rsp->xdata.xdata_val), + (rsp->xdata.xdata_len), rsp->op_ret, + rsp->op_errno, out); + + if (xdata) { + state = frame->root->state; + ret = dict_get_int32n(state->req_xdata, "type", SLEN("type"), &type); + if (ret < 0) + goto out; - GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata, - (rsp->xdata.xdata_val), - (rsp->xdata.xdata_len), rsp->op_ret, - rsp->op_errno, out); + ret = dict_set_int32_sizen(xdata, "type", type); + if (ret < 0) + goto out; + } - ret = 0; + ret = 0; out: - rsp->op_ret = ret; - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to unserialize " - "nameless lookup rsp"); - goto reply; - } - cli_rsp.op_ret = rsp->op_ret; - cli_rsp.op_errno = rsp->op_errno; - cli_rsp.op_errstr = ""; - if (xdata) { - GF_PROTOCOL_DICT_SERIALIZE (frame->this, xdata, - (&cli_rsp.dict.dict_val), - (cli_rsp.dict.dict_len), - cli_rsp.op_errno, reply); - } + rsp->op_ret = ret; + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_DICT_UNSERIALIZE_FAIL, + "failed to unserialize " + "nameless lookup rsp"); + goto reply; + } + cli_rsp.op_ret = rsp->op_ret; + cli_rsp.op_errno = 
rsp->op_errno; + cli_rsp.op_errstr = ""; + if (xdata) { + GF_PROTOCOL_DICT_SERIALIZE(frame->this, xdata, (&cli_rsp.dict.dict_val), + (cli_rsp.dict.dict_len), cli_rsp.op_errno, + reply); + } reply: - quotad_aggregator_submit_reply (frame, frame->local, (void*)&cli_rsp, NULL, 0, - NULL, (xdrproc_t)xdr_gf_cli_rsp); + quotad_aggregator_submit_reply(frame, (frame) ? frame->local : NULL, + (void *)&cli_rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gf_cli_rsp); - dict_unref (xdata); - GF_FREE (cli_rsp.dict.dict_val); - return 0; + dict_unref(xdata); + GF_FREE(cli_rsp.dict.dict_val); + return 0; } int -quotad_aggregator_getlimit (rpcsvc_request_t *req) +quotad_aggregator_getlimit(rpcsvc_request_t *req) { - call_frame_t *frame = NULL; - gf_cli_req cli_req = {{0}, }; - gf_cli_rsp cli_rsp = {0}; - gfs3_lookup_req args = {{0,},}; - gfs3_lookup_rsp rsp = {0,}; - quotad_aggregator_state_t *state = NULL; - xlator_t *this = NULL; - dict_t *dict = NULL; - int ret = -1, op_errno = 0; - char *gfid_str = NULL; - uuid_t gfid = {0}; - - GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err); - - this = THIS; - - ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req); - if (ret < 0) { - //failed to decode msg; - gf_log ("", GF_LOG_ERROR, "xdr decoding error"); - req->rpc_err = GARBAGE_ARGS; - goto err; - } - - if (cli_req.dict.dict_len) { - dict = dict_new (); - ret = dict_unserialize (cli_req.dict.dict_val, - cli_req.dict.dict_len, &dict); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Failed to " - "unserialize req-buffer to dictionary"); - goto err; - } - } - - ret = dict_get_str (dict, "gfid", &gfid_str); - if (ret) { - goto err; - } - - uuid_parse ((const char*)gfid_str, gfid); - - frame = quotad_aggregator_get_frame_from_req (req); - if (frame == NULL) { - rsp.op_errno = ENOMEM; - goto err; - } - state = frame->root->state; - state->xdata = dict; - ret = dict_set_int32 (state->xdata, QUOTA_LIMIT_KEY, 42); - if (ret) - goto err; - - ret = dict_set_int32 (state->xdata, 
QUOTA_SIZE_KEY, 42); - if (ret) - goto err; - - ret = dict_set_int32 (state->xdata, GET_ANCESTRY_PATH_KEY,42); - if (ret) - goto err; - - memcpy (&args.gfid, &gfid, 16); - - args.bname = alloca (req->msg[0].iov_len); - args.xdata.xdata_val = alloca (req->msg[0].iov_len); - - ret = qd_nameless_lookup (this, frame, &args, state->xdata, - quotad_aggregator_getlimit_cbk); - if (ret) { - rsp.op_errno = ret; - goto err; + call_frame_t *frame = NULL; + gf_cli_req cli_req = { + {0}, + }; + gf_cli_rsp cli_rsp = {0}; + quotad_aggregator_state_t *state = NULL; + xlator_t *this = NULL; + dict_t *dict = NULL; + int ret = -1, op_errno = 0; + char *gfid_str = NULL; + uuid_t gfid = {0}; + char *volume_uuid = NULL; + + GF_VALIDATE_OR_GOTO("quotad-aggregator", req, err); + + this = THIS; + + cli_req.dict.dict_val = alloca(req->msg[0].iov_len); + + ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req); + if (ret < 0) { + // failed to decode msg; + gf_msg("this->name", GF_LOG_ERROR, 0, Q_MSG_XDR_DECODE_ERROR, + "xdr decoding error"); + req->rpc_err = GARBAGE_ARGS; + goto err; + } + + if (cli_req.dict.dict_len) { + dict = dict_new(); + ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len, + &dict); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_DICT_UNSERIALIZE_FAIL, + "Failed to unserialize req-buffer to " + "dictionary"); + goto err; } - - return ret; + } + + ret = dict_get_strn(dict, "gfid", SLEN("gfid"), &gfid_str); + if (ret) { + goto err; + } + + ret = dict_get_strn(dict, "volume-uuid", SLEN("volume-uuid"), &volume_uuid); + if (ret) { + goto err; + } + + gf_uuid_parse((const char *)gfid_str, gfid); + + frame = quotad_aggregator_get_frame_from_req(req); + if (frame == NULL) { + cli_rsp.op_errno = ENOMEM; + goto errx; + } + state = frame->root->state; + state->req_xdata = dict; + state->xdata = dict_new(); + dict = NULL; + + ret = dict_set_int32_sizen(state->xdata, QUOTA_LIMIT_KEY, 42); + if (ret) + goto err; + + ret = 
dict_set_int32_sizen(state->xdata, QUOTA_LIMIT_OBJECTS_KEY, 42); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, + "Failed to set QUOTA_LIMIT_OBJECTS_KEY"); + goto err; + } + + ret = dict_set_int32_sizen(state->xdata, QUOTA_SIZE_KEY, 42); + if (ret) + goto err; + + ret = dict_set_int32_sizen(state->xdata, GET_ANCESTRY_PATH_KEY, 42); + if (ret) + goto err; + + ret = qd_nameless_lookup(this, frame, (char *)gfid, state->xdata, + volume_uuid, quotad_aggregator_getlimit_cbk); + if (ret) { + cli_rsp.op_errno = ret; + goto errx; + } + + return ret; err: - cli_rsp.op_ret = -1; - cli_rsp.op_errno = op_errno; - cli_rsp.op_errstr = ""; - - quotad_aggregator_getlimit_cbk (this, frame, &cli_rsp); - dict_unref (dict); - - return ret; + cli_rsp.op_errno = op_errno; +errx: + cli_rsp.op_ret = -1; + cli_rsp.op_errstr = ""; + + quotad_aggregator_getlimit_cbk(this, frame, &cli_rsp); + if (dict) + dict_unref(dict); + return ret; } int -quotad_aggregator_lookup_cbk (xlator_t *this, call_frame_t *frame, - void *rsp) +quotad_aggregator_lookup_cbk(xlator_t *this, call_frame_t *frame, void *rsp) { - quotad_aggregator_submit_reply (frame, frame->local, rsp, NULL, 0, NULL, - (xdrproc_t)xdr_gfs3_lookup_rsp); + quotad_aggregator_submit_reply(frame, frame ? 
frame->local : NULL, rsp, + NULL, 0, NULL, + (xdrproc_t)xdr_gfs3_lookup_rsp); - return 0; + return 0; } - int -quotad_aggregator_lookup (rpcsvc_request_t *req) +quotad_aggregator_lookup(rpcsvc_request_t *req) { - call_frame_t *frame = NULL; - gfs3_lookup_req args = {{0,},}; - int ret = -1, op_errno = 0; - gfs3_lookup_rsp rsp = {0,}; - quotad_aggregator_state_t *state = NULL; - xlator_t *this = NULL; - - GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err); - - this = THIS; - - args.bname = alloca (req->msg[0].iov_len); - args.xdata.xdata_val = alloca (req->msg[0].iov_len); - - ret = xdr_to_generic (req->msg[0], &args, - (xdrproc_t)xdr_gfs3_lookup_req); - if (ret < 0) { - rsp.op_errno = EINVAL; - goto err; - } - - frame = quotad_aggregator_get_frame_from_req (req); - if (frame == NULL) { - rsp.op_errno = ENOMEM; + call_frame_t *frame = NULL; + gfs3_lookup_req args = { + { + 0, + }, + }; + int i = 0, ret = -1, op_errno = 0; + gfs3_lookup_rsp rsp = { + 0, + }; + quotad_aggregator_state_t *state = NULL; + xlator_t *this = NULL; + dict_t *dict = NULL; + char *volume_uuid = NULL; + + GF_VALIDATE_OR_GOTO("quotad-aggregator", req, err); + + this = THIS; + + args.bname = alloca(req->msg[0].iov_len); + args.xdata.xdata_val = alloca(req->msg[0].iov_len); + + ret = xdr_to_generic(req->msg[0], &args, (xdrproc_t)xdr_gfs3_lookup_req); + if (ret < 0) { + rsp.op_errno = EINVAL; + goto err; + } + + frame = quotad_aggregator_get_frame_from_req(req); + if (frame == NULL) { + rsp.op_errno = ENOMEM; + goto err; + } + + state = frame->root->state; + + GF_PROTOCOL_DICT_UNSERIALIZE(this, dict, (args.xdata.xdata_val), + (args.xdata.xdata_len), ret, op_errno, err); + + ret = dict_get_str(dict, "volume-uuid", &volume_uuid); + if (ret) { + goto err; + } + + state->xdata = dict_new(); + + for (i = 0; qd_ext_xattrs[i]; i++) { + if (dict_get(dict, qd_ext_xattrs[i])) { + ret = dict_set_uint32(state->xdata, qd_ext_xattrs[i], 1); + if (ret < 0) goto err; } + } - state = frame->root->state; - - 
GF_PROTOCOL_DICT_UNSERIALIZE (this, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, - op_errno, err); - + ret = qd_nameless_lookup(this, frame, args.gfid, state->xdata, volume_uuid, + quotad_aggregator_lookup_cbk); + if (ret) { + rsp.op_errno = ret; + goto err; + } - ret = qd_nameless_lookup (this, frame, &args, state->xdata, - quotad_aggregator_lookup_cbk); - if (ret) { - rsp.op_errno = ret; - goto err; - } + if (dict) + dict_unref(dict); - return ret; + return ret; err: - rsp.op_ret = -1; - rsp.op_errno = op_errno; + rsp.op_ret = -1; + rsp.op_errno = op_errno; + + quotad_aggregator_lookup_cbk(this, frame, &rsp); + if (dict) + dict_unref(dict); - quotad_aggregator_lookup_cbk (this, frame, &rsp); - return ret; + return ret; } int -quotad_aggregator_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, - void *data) +quotad_aggregator_rpc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, + void *data) { - if (!xl || !data) { - gf_log_callingfn ("server", GF_LOG_WARNING, - "Calling rpc_notify without initializing"); - goto out; - } + if (!xl || !data) { + gf_log_callingfn("server", GF_LOG_WARNING, + "Calling rpc_notify without initializing"); + goto out; + } - switch (event) { + switch (event) { case RPCSVC_EVENT_ACCEPT: - break; + break; case RPCSVC_EVENT_DISCONNECT: - break; + break; default: - break; - } + break; + } out: - return 0; + return 0; } int -quotad_aggregator_init (xlator_t *this) +quotad_aggregator_init(xlator_t *this) { - quota_priv_t *priv = NULL; - int ret = -1; - - priv = this->private; - - ret = dict_set_str (this->options, "transport.address-family", "unix"); - if (ret) - goto out; - - ret = dict_set_str (this->options, "transport-type", "socket"); - if (ret) - goto out; - - ret = dict_set_str (this->options, "transport.socket.listen-path", - "/tmp/quotad.socket"); - if (ret) - goto out; - - /* RPC related */ - priv->rpcsvc = rpcsvc_init (this, this->ctx, this->options, 0); - if (priv->rpcsvc == NULL) { - gf_log 
(this->name, GF_LOG_WARNING, - "creation of rpcsvc failed"); - ret = -1; - goto out; - } + quota_priv_t *priv = NULL; + int ret = -1; - ret = rpcsvc_create_listeners (priv->rpcsvc, this->options, - this->name); - if (ret < 1) { - gf_log (this->name, GF_LOG_WARNING, - "creation of listener failed"); - ret = -1; - goto out; - } - - priv->quotad_aggregator = &quotad_aggregator_prog; - quotad_aggregator_prog.options = this->options; - - ret = rpcsvc_program_register (priv->rpcsvc, &quotad_aggregator_prog); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "registration of program (name:%s, prognum:%d, " - "progver:%d) failed", quotad_aggregator_prog.progname, - quotad_aggregator_prog.prognum, - quotad_aggregator_prog.progver); - goto out; - } + priv = this->private; - ret = 0; + if (priv->rpcsvc) { + /* Listener already created */ + return 0; + } + + ret = dict_set_nstrn(this->options, "transport.address-family", + SLEN("transport.address-family"), "unix", + SLEN("unix")); + if (ret) + goto out; + + ret = dict_set_nstrn(this->options, "transport-type", + SLEN("transport-type"), "socket", SLEN("socket")); + if (ret) + goto out; + + ret = dict_set_nstrn(this->options, "transport.socket.listen-path", + SLEN("transport.socket.listen-path"), + "/var/run/gluster/quotad.socket", + SLEN("/var/run/gluster/quotad.socket")); + if (ret) + goto out; + + /* RPC related */ + priv->rpcsvc = rpcsvc_init(this, this->ctx, this->options, 0); + if (priv->rpcsvc == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPCSVC_INIT_FAILED, + "creation of rpcsvc failed"); + ret = -1; + goto out; + } + + ret = rpcsvc_create_listeners(priv->rpcsvc, this->options, this->name); + if (ret < 1) { + gf_msg(this->name, GF_LOG_WARNING, 0, + Q_MSG_RPCSVC_LISTENER_CREATION_FAILED, + "creation of listener failed"); + ret = -1; + goto out; + } + + priv->quotad_aggregator = &quotad_aggregator_prog; + quotad_aggregator_prog.options = this->options; + + ret = rpcsvc_program_register(priv->rpcsvc, &quotad_aggregator_prog, 
+ _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPCSVC_REGISTER_FAILED, + "registration of program (name:%s, prognum:%d, " + "progver:%d) failed", + quotad_aggregator_prog.progname, quotad_aggregator_prog.prognum, + quotad_aggregator_prog.progver); + goto out; + } + + ret = 0; out: - return ret; + if (ret && priv->rpcsvc) { + GF_FREE(priv->rpcsvc); + priv->rpcsvc = NULL; + } + + return ret; } -rpcsvc_actor_t quotad_aggregator_actors[] = { - [GF_AGGREGATOR_NULL] = {"NULL", GF_AGGREGATOR_NULL, NULL, NULL, 0, - DRC_NA}, - [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", GF_AGGREGATOR_NULL, - quotad_aggregator_lookup, NULL, 0, DRC_NA}, - [GF_AGGREGATOR_GETLIMIT] = {"GETLIMIT", GF_AGGREGATOR_GETLIMIT, - quotad_aggregator_getlimit, NULL, 0}, +static rpcsvc_actor_t quotad_aggregator_actors[GF_AGGREGATOR_MAXVALUE] = { + [GF_AGGREGATOR_NULL] = {"NULL", NULL, NULL, GF_AGGREGATOR_NULL, DRC_NA, 0}, + [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", quotad_aggregator_lookup, NULL, + GF_AGGREGATOR_NULL, DRC_NA, 0}, + [GF_AGGREGATOR_GETLIMIT] = {"GETLIMIT", quotad_aggregator_getlimit, NULL, + GF_AGGREGATOR_GETLIMIT, DRC_NA, 0}, }; - -struct rpcsvc_program quotad_aggregator_prog = { - .progname = "GlusterFS 3.3", - .prognum = GLUSTER_AGGREGATOR_PROGRAM, - .progver = GLUSTER_AGGREGATOR_VERSION, - .numactors = GF_AGGREGATOR_MAXVALUE, - .actors = quotad_aggregator_actors -}; +static struct rpcsvc_program quotad_aggregator_prog = { + .progname = "GlusterFS 3.3", + .prognum = GLUSTER_AGGREGATOR_PROGRAM, + .progver = GLUSTER_AGGREGATOR_VERSION, + .numactors = GF_AGGREGATOR_MAXVALUE, + .actors = quotad_aggregator_actors}; diff --git a/xlators/features/quota/src/quotad-aggregator.h b/xlators/features/quota/src/quotad-aggregator.h index 5ddea5b3c46..706592c7d50 100644 --- a/xlators/features/quota/src/quotad-aggregator.h +++ b/xlators/features/quota/src/quotad-aggregator.h @@ -12,26 +12,27 @@ #define _QUOTAD_AGGREGATOR_H #include "quota.h" -#include "stack.h" +#include <glusterfs/stack.h> 
#include "glusterfs3-xdr.h" -#include "inode.h" +#include <glusterfs/inode.h> typedef struct { - void *pool; - xlator_t *this; - xlator_t *active_subvol; - inode_table_t *itable; - loc_t loc; - dict_t *xdata; + void *pool; + xlator_t *this; + xlator_t *active_subvol; + inode_table_t *itable; + loc_t loc; + dict_t *xdata; + dict_t *req_xdata; } quotad_aggregator_state_t; -typedef int (*quotad_aggregator_lookup_cbk_t) (xlator_t *this, - call_frame_t *frame, - void *rsp); +typedef int (*quotad_aggregator_lookup_cbk_t)(xlator_t *this, + call_frame_t *frame, void *rsp); int -qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req, - dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk); +qd_nameless_lookup(xlator_t *this, call_frame_t *frame, char *gfid, + dict_t *xdata, char *volume_uuid, + quotad_aggregator_lookup_cbk_t lookup_cbk); int -quotad_aggregator_init (xlator_t *this); +quotad_aggregator_init(xlator_t *this); #endif diff --git a/xlators/features/quota/src/quotad-helpers.c b/xlators/features/quota/src/quotad-helpers.c index fd309911474..51ff1d7e98d 100644 --- a/xlators/features/quota/src/quotad-helpers.c +++ b/xlators/features/quota/src/quotad-helpers.c @@ -11,103 +11,97 @@ #include "quotad-helpers.h" quotad_aggregator_state_t * -get_quotad_aggregator_state (xlator_t *this, rpcsvc_request_t *req) +get_quotad_aggregator_state(xlator_t *this, rpcsvc_request_t *req) { - quotad_aggregator_state_t *state = NULL; - xlator_t *active_subvol = NULL; - quota_priv_t *priv = NULL; + quotad_aggregator_state_t *state = NULL; + xlator_t *active_subvol = NULL; + quota_priv_t *priv = NULL; - state = (void *)GF_CALLOC (1, sizeof (*state), - gf_quota_mt_aggregator_state_t); - if (!state) - return NULL; + state = (void *)GF_CALLOC(1, sizeof(*state), + gf_quota_mt_aggregator_state_t); + if (!state) + return NULL; - state->this = THIS; - priv = this->private; + state->this = THIS; + priv = this->private; - LOCK (&priv->lock); - { - active_subvol = 
state->active_subvol = FIRST_CHILD (this); - } - UNLOCK (&priv->lock); + LOCK(&priv->lock); + { + active_subvol = state->active_subvol = FIRST_CHILD(this); + } + UNLOCK(&priv->lock); - if (active_subvol->itable == NULL) - active_subvol->itable = inode_table_new (4096, active_subvol); + if (active_subvol->itable == NULL) + active_subvol->itable = inode_table_new(4096, active_subvol); - state->itable = active_subvol->itable; + state->itable = active_subvol->itable; - state->pool = this->ctx->pool; + state->pool = this->ctx->pool; - return state; + return state; } void -quotad_aggregator_free_state (quotad_aggregator_state_t *state) +quotad_aggregator_free_state(quotad_aggregator_state_t *state) { - if (state->xdata) - dict_unref (state->xdata); + if (state->xdata) + dict_unref(state->xdata); - GF_FREE (state); + if (state->req_xdata) + dict_unref(state->req_xdata); + + GF_FREE(state); } call_frame_t * -quotad_aggregator_alloc_frame (rpcsvc_request_t *req) +quotad_aggregator_alloc_frame(rpcsvc_request_t *req) { - call_frame_t *frame = NULL; - quotad_aggregator_state_t *state = NULL; - xlator_t *this = NULL; + call_frame_t *frame = NULL; + quotad_aggregator_state_t *state = NULL; + xlator_t *this = NULL; - GF_VALIDATE_OR_GOTO ("server", req, out); - GF_VALIDATE_OR_GOTO ("server", req->trans, out); - GF_VALIDATE_OR_GOTO ("server", req->svc, out); - GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out); + GF_VALIDATE_OR_GOTO("server", req, out); + GF_VALIDATE_OR_GOTO("server", req->trans, out); + GF_VALIDATE_OR_GOTO("server", req->svc, out); + GF_VALIDATE_OR_GOTO("server", req->svc->ctx, out); - this = req->svc->mydata; + this = req->svc->xl; - frame = create_frame (this, req->svc->ctx->pool); - if (!frame) - goto out; + frame = create_frame(this, req->svc->ctx->pool); + if (!frame) + goto out; - state = get_quotad_aggregator_state (this, req); - if (!state) - goto out; + state = get_quotad_aggregator_state(this, req); + if (!state) + goto out; - frame->root->state = state; 
- frame->root->unique = 0; + frame->root->state = state; - frame->this = this; + frame->this = this; out: - return frame; + return frame; } call_frame_t * -quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req) +quotad_aggregator_get_frame_from_req(rpcsvc_request_t *req) { - call_frame_t *frame = NULL; - client_t *client = NULL; - - GF_VALIDATE_OR_GOTO ("server", req, out); - - frame = quotad_aggregator_alloc_frame (req); - if (!frame) - goto out; - - client = req->trans->xl_private; + call_frame_t *frame = NULL; - frame->root->op = req->procnum; + GF_VALIDATE_OR_GOTO("server", req, out); - frame->root->unique = req->xid; + frame = quotad_aggregator_alloc_frame(req); + if (!frame) + goto out; - frame->root->uid = req->uid; - frame->root->gid = req->gid; - frame->root->pid = req->pid; + frame->root->op = req->procnum; - gf_client_ref (client); - frame->root->client = client; + frame->root->uid = req->uid; + frame->root->gid = req->gid; + frame->root->pid = req->pid; - frame->root->lk_owner = req->lk_owner; + frame->root->lk_owner = req->lk_owner; - frame->local = req; + frame->local = req; out: - return frame; + return frame; } diff --git a/xlators/features/quota/src/quotad-helpers.h b/xlators/features/quota/src/quotad-helpers.h index a10fb7fa82a..bcb39fe845e 100644 --- a/xlators/features/quota/src/quotad-helpers.h +++ b/xlators/features/quota/src/quotad-helpers.h @@ -16,9 +16,9 @@ #include "quotad-aggregator.h" void -quotad_aggregator_free_state (quotad_aggregator_state_t *state); +quotad_aggregator_free_state(quotad_aggregator_state_t *state); call_frame_t * -quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req); +quotad_aggregator_get_frame_from_req(rpcsvc_request_t *req); #endif diff --git a/xlators/features/quota/src/quotad.c b/xlators/features/quota/src/quotad.c index 243b943e986..643f25c9c2a 100644 --- a/xlators/features/quota/src/quotad.c +++ b/xlators/features/quota/src/quotad.c @@ -9,202 +9,237 @@ */ #include "quota.h" #include 
"quotad-aggregator.h" -#include "common-utils.h" -int32_t -mem_acct_init (xlator_t *this) +int +qd_notify(xlator_t *this, int32_t event, void *data, ...) { - int ret = -1; + switch (event) { + case GF_EVENT_PARENT_UP: + quotad_aggregator_init(this); + } + + default_notify(this, event, data); + return 0; +} - if (!this) - return ret; +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; - ret = xlator_mem_acct_init (this, gf_quota_mt_end + 1); + if (!this) + return ret; - if (0 != ret) { - gf_log (this->name, GF_LOG_WARNING, "Memory accounting " - "init failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_quota_mt_end + 1); + if (0 != ret) { + gf_log(this->name, GF_LOG_WARNING, + "Memory accounting " + "init failed"); return ret; + } + + return ret; } int32_t -qd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *postparent) +qd_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - quotad_aggregator_lookup_cbk_t lookup_cbk = NULL; - gfs3_lookup_rsp rsp = {0, }; + quotad_aggregator_lookup_cbk_t lookup_cbk = NULL; + gfs3_lookup_rsp rsp = { + 0, + }; - lookup_cbk = cookie; + lookup_cbk = cookie; - rsp.op_ret = op_ret; - rsp.op_errno = op_errno; + rsp.op_ret = op_ret; + rsp.op_errno = op_errno; - gf_stat_from_iatt (&rsp.postparent, postparent); + gf_stat_from_iatt(&rsp.postparent, postparent); - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), - rsp.xdata.xdata_len, rsp.op_errno, out); + GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&rsp.xdata.xdata_val), + rsp.xdata.xdata_len, rsp.op_errno, out); - gf_stat_from_iatt (&rsp.stat, buf); + gf_stat_from_iatt(&rsp.stat, buf); out: - lookup_cbk (this, frame, &rsp); + lookup_cbk(this, frame, &rsp); - GF_FREE (rsp.xdata.xdata_val); + GF_FREE(rsp.xdata.xdata_val); - 
inode_unref (inode); + inode_unref(inode); - return 0; + return 0; } xlator_t * -qd_find_subvol (xlator_t *this, char *volume_uuid) +qd_find_subvol(xlator_t *this, char *volume_uuid) { - xlator_list_t *child = NULL; - xlator_t *subvol = NULL; - char key[1024]; - char *optstr = NULL; - - if (!this || !volume_uuid) - goto out; - - for (child = this->children; child; child = child->next) { - snprintf(key, 1024, "%s.volume-id", child->xlator->name); - if (dict_get_str(this->options, key, &optstr) < 0) - continue; - - if (strcmp (optstr, volume_uuid) == 0) { - subvol = child->xlator; - break; - } + xlator_list_t *child = NULL; + xlator_t *subvol = NULL; + char key[1024]; + int keylen = 0; + char *optstr = NULL; + + if (!this || !volume_uuid) + goto out; + + for (child = this->children; child; child = child->next) { + keylen = snprintf(key, sizeof(key), "%s.volume-id", + child->xlator->name); + if (dict_get_strn(this->options, key, keylen, &optstr) < 0) + continue; + + if (strcmp(optstr, volume_uuid) == 0) { + subvol = child->xlator; + break; } + } out: - return subvol; + return subvol; } int -qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req, - dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk) +qd_nameless_lookup(xlator_t *this, call_frame_t *frame, char *gfid, + dict_t *xdata, char *volume_uuid, + quotad_aggregator_lookup_cbk_t lookup_cbk) { - gfs3_lookup_rsp rsp = {0, }; - int op_errno = 0, ret = -1; - loc_t loc = {0, }; - quotad_aggregator_state_t *state = NULL; - quota_priv_t *priv = NULL; - xlator_t *subvol = NULL; - char *volume_uuid = NULL; - - priv = this->private; - state = frame->root->state; - - frame->root->op = GF_FOP_LOOKUP; - - loc.inode = inode_new (state->itable); - if (loc.inode == NULL) { - op_errno = ENOMEM; - goto out; - } - - memcpy (loc.gfid, req->gfid, 16); - - ret = dict_get_str (xdata, "volume-uuid", &volume_uuid); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } - - subvol = qd_find_subvol (this, 
volume_uuid); - if (subvol == NULL) { - op_errno = EINVAL; - goto out; - } - - STACK_WIND_COOKIE (frame, qd_lookup_cbk, lookup_cbk, subvol, - subvol->fops->lookup, &loc, xdata); - return 0; + gfs3_lookup_rsp rsp = { + 0, + }; + int op_errno = 0, ret = -1; + loc_t loc = { + 0, + }; + quotad_aggregator_state_t *state = NULL; + xlator_t *subvol = NULL; + + state = frame->root->state; + + frame->root->op = GF_FOP_LOOKUP; + + loc.inode = inode_new(state->itable); + if (loc.inode == NULL) { + op_errno = ENOMEM; + goto out; + } + + memcpy(loc.gfid, gfid, 16); + + ret = dict_set_int8(xdata, QUOTA_READ_ONLY_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM, + "dict set failed"); + ret = -ENOMEM; + goto out; + } + + subvol = qd_find_subvol(this, volume_uuid); + if (subvol == NULL) { + op_errno = EINVAL; + goto out; + } + + STACK_WIND_COOKIE(frame, qd_lookup_cbk, lookup_cbk, subvol, + subvol->fops->lookup, &loc, xdata); + return 0; out: - rsp.op_ret = -1; - rsp.op_errno = op_errno; + rsp.op_ret = -1; + rsp.op_errno = op_errno; - lookup_cbk (this, frame, &rsp); + lookup_cbk(this, frame, &rsp); - inode_unref (loc.inode); - return 0; + inode_unref(loc.inode); + return 0; } int -qd_reconfigure (xlator_t *this, dict_t *options) +qd_reconfigure(xlator_t *this, dict_t *options) { - /* As of now quotad is restarted upon alteration of volfile */ - return 0; + /* As of now quotad is restarted upon alteration of volfile */ + return 0; } void -qd_fini (xlator_t *this) +qd_fini(xlator_t *this) { - return; + quota_priv_t *priv = NULL; + + if (this == NULL || this->private == NULL) + goto out; + + priv = this->private; + + if (priv->rpcsvc) { + GF_FREE(priv->rpcsvc); + priv->rpcsvc = NULL; + } + + GF_FREE(priv); + +out: + return; } int32_t -qd_init (xlator_t *this) +qd_init(xlator_t *this) { - int32_t ret = -1; - quota_priv_t *priv = NULL; - - if (NULL == this->children) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: quota (%s) not configured for min of 1 
child", - this->name); - ret = -1; - goto err; - } + int32_t ret = -1; + quota_priv_t *priv = NULL; - QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err); - LOCK_INIT (&priv->lock); + if (NULL == this->children) { + gf_log(this->name, GF_LOG_ERROR, + "FATAL: quota (%s) not configured for min of 1 child", + this->name); + ret = -1; + goto err; + } - this->private = priv; + QUOTA_ALLOC_OR_GOTO(priv, quota_priv_t, err); + LOCK_INIT(&priv->lock); - ret = quotad_aggregator_init (this); - if (ret < 0) - goto err; + this->private = priv; - ret = 0; + ret = 0; err: - if (ret) { - GF_FREE (priv); - } - return ret; + if (ret) { + GF_FREE(priv); + } + return ret; } -class_methods_t class_methods = { - .init = qd_init, - .fini = qd_fini, - .reconfigure = qd_reconfigure, -}; +struct xlator_fops fops = {}; -struct xlator_fops fops = { -}; +struct xlator_cbks cbks = {}; -struct xlator_cbks cbks = { +struct volume_options options[] = { + {.key = {"transport-type"}, + .value = {"rpc", "rpc-over-rdma", "tcp", "socket", "ib-verbs", "unix", + "ib-sdp", "tcp/server", "ib-verbs/server", "rdma", + "rdma*([ \t]),*([ \t])socket", "rdma*([ \t]),*([ \t])tcp", + "tcp*([ \t]),*([ \t])rdma", "socket*([ \t]),*([ \t])rdma"}, + .type = GF_OPTION_TYPE_STR}, + { + .key = {"transport.*"}, + .type = GF_OPTION_TYPE_ANY, + }, + {.key = {NULL}}, }; -struct volume_options options[] = { - { .key = {"transport-type"}, - .value = {"rpc", "rpc-over-rdma", "tcp", "socket", "ib-verbs", - "unix", "ib-sdp", "tcp/server", "ib-verbs/server", "rdma", - "rdma*([ \t]),*([ \t])socket", - "rdma*([ \t]),*([ \t])tcp", - "tcp*([ \t]),*([ \t])rdma", - "socket*([ \t]),*([ \t])rdma"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"transport.*"}, - .type = GF_OPTION_TYPE_ANY, - }, - {.key = {NULL}} +xlator_api_t xlator_api = { + .init = qd_init, + .fini = qd_fini, + .reconfigure = qd_reconfigure, + .notify = qd_notify, + .mem_acct_init = mem_acct_init, + .op_version = {1}, + .fops = &fops, + .cbks = &cbks, + .options = options, + 
.identifier = "quotad", + .category = GF_MAINTAINED, }; diff --git a/xlators/features/read-only/src/Makefile.am b/xlators/features/read-only/src/Makefile.am index 4c146213742..e4a2017ef0d 100644 --- a/xlators/features/read-only/src/Makefile.am +++ b/xlators/features/read-only/src/Makefile.am @@ -2,19 +2,20 @@ xlator_LTLIBRARIES = read-only.la worm.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -noinst_HEADERS = read-only-common.h +noinst_HEADERS = read-only.h read-only-mem-types.h read-only-common.h worm-helper.h -read_only_la_LDFLAGS = -module -avoid-version +read_only_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) read_only_la_SOURCES = read-only.c read-only-common.c read_only_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -worm_la_LDFLAGS = -module -avoid-version +worm_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) -worm_la_SOURCES = read-only-common.c worm.c +worm_la_SOURCES = read-only-common.c worm-helper.c worm.c worm_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/read-only/src/read-only-common.c b/xlators/features/read-only/src/read-only-common.c index 56a7a7176aa..9640e7e3eee 100644 --- a/xlators/features/read-only/src/read-only-common.c +++ b/xlators/features/read-only/src/read-only-common.c @@ -7,233 +7,400 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +#include "read-only.h" +#include "read-only-mem-types.h" +#include <glusterfs/defaults.h> -#include "xlator.h" -#include "defaults.h" +gf_boolean_t +is_readonly_or_worm_enabled(call_frame_t *frame, xlator_t *this) +{ + read_only_priv_t *priv = NULL; + gf_boolean_t readonly_or_worm_enabled = _gf_false; + + priv = this->private; + GF_ASSERT(priv); + + readonly_or_worm_enabled = priv->readonly_or_worm_enabled; + + if (frame->root->pid < GF_CLIENT_PID_MAX) + readonly_or_worm_enabled = _gf_false; + + return readonly_or_worm_enabled; +} + +static int +_check_key_is_zero_filled(dict_t *d, char *k, data_t *v, void *tmp) +{ + if (mem_0filled((const char *)v->data, v->len)) { + /* -1 means, no more iterations, treat as 'break' */ + return -1; + } + return 0; +} int32_t -ro_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +ro_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - STACK_UNWIND_STRICT (xattrop, frame, -1, EROFS, NULL, xdata); - return 0; + gf_boolean_t allzero = _gf_false; + int ret = 0; + + ret = dict_foreach(dict, _check_key_is_zero_filled, NULL); + if (ret == 0) + allzero = _gf_true; + + if (is_readonly_or_worm_enabled(frame, this) && !allzero) + STACK_UNWIND_STRICT(xattrop, frame, -1, EROFS, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, + xdata); + return 0; } int32_t -ro_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +ro_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - STACK_UNWIND_STRICT (fxattrop, frame, -1, EROFS, NULL, xdata); - return 0; + gf_boolean_t allzero = _gf_false; + int ret = 0; + + ret = dict_foreach(dict, _check_key_is_zero_filled, NULL); + 
if (ret == 0) + allzero = _gf_true; + + if (is_readonly_or_worm_enabled(frame, this) && !allzero) + STACK_UNWIND_STRICT(fxattrop, frame, -1, EROFS, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, + xdata); + + return 0; } int32_t -ro_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +ro_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - STACK_UNWIND_STRICT (entrylk, frame, -1, EROFS, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + + return 0; } int32_t -ro_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +ro_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - STACK_UNWIND_STRICT (fentrylk, frame, -1, EROFS, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + + return 0; } int32_t -ro_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +ro_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND_STRICT (inodelk, frame, -1, EROFS, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk, + volume, loc, cmd, lock, xdata); + + return 0; } int32_t -ro_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) 
+ro_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND_STRICT (finodelk, frame, -1, EROFS, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk, + volume, fd, cmd, lock, xdata); + + return 0; } int32_t -ro_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, - struct gf_flock *flock, dict_t *xdata) +ro_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) { - STACK_UNWIND_STRICT (lk, frame, -1, EROFS, NULL, xdata); - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, fd, + cmd, flock, xdata); + + return 0; } int32_t -ro_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +ro_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - STACK_UNWIND_STRICT (setattr, frame, -1, EROFS, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(setattr, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + xdata); + + return 0; } int32_t -ro_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +ro_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - STACK_UNWIND_STRICT (fsetattr, frame, -1, EROFS, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(fsetattr, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, + xdata); + + return 0; } +int32_t +ro_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + if 
(is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(truncate, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + + return 0; +} int32_t -ro_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) +ro_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - STACK_UNWIND_STRICT (truncate, frame, -1, EROFS, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(ftruncate, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + + return 0; } int32_t -ro_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) +ro_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - STACK_UNWIND_STRICT (ftruncate, frame, -1, EROFS, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(fallocate, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, + len, xdata); + return 0; } int -ro_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata) +ro_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - STACK_UNWIND_STRICT (mknod, frame, -1, EROFS, NULL, NULL, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(mknod, frame, -1, EROFS, NULL, NULL, NULL, NULL, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, + xdata); + + return 0; } - int -ro_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t 
umask, dict_t *xdata) +ro_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - STACK_UNWIND_STRICT (mkdir, frame, -1, EROFS, NULL, NULL, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(mkdir, frame, -1, EROFS, NULL, NULL, NULL, NULL, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, + xdata); + + return 0; } int32_t -ro_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +ro_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - STACK_UNWIND_STRICT (unlink, frame, -1, EROFS, NULL, NULL, xdata); - return 0; -} + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(unlink, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +} int -ro_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) +ro_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - STACK_UNWIND_STRICT (rmdir, frame, -1, EROFS, NULL, NULL, xdata); - return 0; -} + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(rmdir, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + return 0; +} int -ro_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata) +ro_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - STACK_UNWIND_STRICT (symlink, frame, -1, EROFS, NULL, NULL, NULL, - NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(symlink, frame, -1, EROFS, NULL, NULL, NULL, NULL, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, + xdata); + + return 0; } - - int32_t -ro_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) +ro_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - STACK_UNWIND_STRICT (rename, frame, -1, EROFS, NULL, NULL, NULL, NULL, - NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(rename, frame, -1, EROFS, NULL, NULL, NULL, NULL, + NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + + return 0; } - int32_t -ro_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) +ro_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - STACK_UNWIND_STRICT (link, frame, -1, EROFS, NULL, NULL, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(link, frame, -1, EROFS, NULL, NULL, NULL, NULL, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + + return 0; } int32_t -ro_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +ro_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (create, frame, -1, EROFS, NULL, NULL, NULL, - NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(create, frame, -1, EROFS, NULL, NULL, NULL, NULL, + NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, + umask, fd, xdata); + + return 0; } - static int32_t -ro_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd, dict_t *xdata) 
+ro_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); + return 0; } int32_t -ro_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +ro_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - if (((flags & O_ACCMODE) == O_WRONLY) || - ((flags & O_ACCMODE) == O_RDWR)) { - STACK_UNWIND_STRICT (open, frame, -1, EROFS, NULL, xdata); - return 0; - } + if (is_readonly_or_worm_enabled(frame, this) && + (((flags & O_ACCMODE) == O_WRONLY) || + ((flags & O_ACCMODE) == O_RDWR))) { + STACK_UNWIND_STRICT(open, frame, -1, EROFS, NULL, xdata); + return 0; + } - STACK_WIND (frame, ro_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; + STACK_WIND(frame, ro_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; } int32_t -ro_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata) +ro_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (fsetxattr, frame, -1, EROFS, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(fsetxattr, frame, -1, EROFS, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + + return 0; } int32_t -ro_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) +ro_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) { - STACK_UNWIND_STRICT (fsyncdir, frame, -1, EROFS, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(fsyncdir, frame, -1, EROFS, 
xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata); + + return 0; } int32_t -ro_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t off, uint32_t flags, struct iobref *iobref, dict_t *xdata) +ro_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - STACK_UNWIND_STRICT (writev, frame, -1, EROFS, NULL, NULL, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(writev, frame, -1, EROFS, NULL, NULL, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, + flags, iobref, xdata); + + return 0; } - int32_t -ro_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) +ro_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (setxattr, frame, -1, EROFS, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(setxattr, frame, -1, EROFS, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + + return 0; } int32_t -ro_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +ro_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - STACK_UNWIND_STRICT (removexattr, frame, -1, EROFS, xdata); - return 0; + if (is_readonly_or_worm_enabled(frame, this)) + STACK_UNWIND_STRICT(removexattr, frame, -1, EROFS, xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + + return 0; } diff --git a/xlators/features/read-only/src/read-only-common.h b/xlators/features/read-only/src/read-only-common.h index 
5d4c7e260ed..5561961ffa2 100644 --- a/xlators/features/read-only/src/read-only-common.h +++ b/xlators/features/read-only/src/read-only-common.h @@ -7,109 +7,115 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> -#include "xlator.h" -#include "defaults.h" +gf_boolean_t +is_readonly_or_worm_enabled(call_frame_t *frame, xlator_t *this); int32_t -ro_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); +ro_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); int32_t -ro_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); +ro_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); int32_t -ro_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata); +ro_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); int32_t -ro_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type - type, dict_t *xdata); +ro_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); int32_t -ro_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata); +ro_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata); int32_t -ro_finodelk 
(call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata); +ro_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata); int32_t -ro_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, - struct gf_flock *flock, dict_t *xdata); +ro_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata); int32_t -ro_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata); +ro_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata); int32_t -ro_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata); - +ro_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata); int32_t -ro_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata); +ro_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); int32_t -ro_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata); +ro_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); int -ro_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *xdata); +ro_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); int -ro_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata); +ro_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); int32_t -ro_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata); - -int -ro_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, +ro_unlink(call_frame_t *frame, 
xlator_t *this, loc_t *loc, int xflag, dict_t *xdata); +int +ro_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); int -ro_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata); +ro_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); int32_t -ro_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata); +ro_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int32_t -ro_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata); +ro_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int32_t -ro_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); +ro_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); int32_t -ro_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata); +ro_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); int32_t -ro_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags, dict_t *xdata); +ro_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -ro_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata); +ro_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); int32_t -ro_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t off, uint32_t flags, struct iobref *iobref, dict_t *xdata); +ro_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, struct iobref *iobref, + 
dict_t *xdata); int32_t -ro_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata); +ro_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t +ro_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); int32_t -ro_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata); +ro_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); diff --git a/xlators/features/changelog/src/changelog-notifier.h b/xlators/features/read-only/src/read-only-mem-types.h index 55e728356e6..c67d6c02cd0 100644 --- a/xlators/features/changelog/src/changelog-notifier.h +++ b/xlators/features/read-only/src/read-only-mem-types.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,12 +8,13 @@ cases as published by the Free Software Foundation. */ -#ifndef _CHANGELOG_NOTIFIER_H -#define _CHANGELOG_NOTIFIER_H +#ifndef __READONLY_MEM_TYPES_H__ +#define __READONLY_MEM_TYPES_H__ -#include "changelog-helpers.h" - -void * -changelog_notifier (void *data); +#include <glusterfs/mem-types.h> +enum gf_read_only_mem_types_ { + gf_read_only_mt_priv_t = gf_common_mt_end + 1, + gf_read_only_mt_end +}; #endif diff --git a/xlators/features/read-only/src/read-only.c b/xlators/features/read-only/src/read-only.c index e49e54a1b31..48654998e63 100644 --- a/xlators/features/read-only/src/read-only.c +++ b/xlators/features/read-only/src/read-only.c @@ -7,71 +7,138 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" #include "read-only-common.h" +#include "read-only-mem-types.h" +#include "read-only.h" int32_t -init (xlator_t *this) +mem_acct_init(xlator_t *this) { - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_read_only_mt_end + 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting " + "initialization failed."); + + return ret; } +int32_t +init(xlator_t *this) +{ + int ret = -1; + read_only_priv_t *priv = NULL; -void -fini (xlator_t *this) + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "translator not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_read_only_mt_priv_t); + if (!priv) + goto out; + + this->private = priv; + + GF_OPTION_INIT("read-only", priv->readonly_or_worm_enabled, bool, out); + + ret = 0; +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) { - return; + read_only_priv_t *priv = NULL; + int ret = -1; + gf_boolean_t readonly_or_worm_enabled = _gf_false; + + priv = this->private; + GF_ASSERT(priv); + + GF_OPTION_RECONF("read-only", readonly_or_worm_enabled, options, bool, out); + priv->readonly_or_worm_enabled = readonly_or_worm_enabled; + ret = 0; +out: + gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret); + return ret; } +void +fini(xlator_t *this) +{ + read_only_priv_t *priv = NULL; + + priv = this->private; + if (!priv) + return; + + this->private = NULL; + GF_FREE(priv); + + return; +} struct xlator_fops fops = { - .mknod = ro_mknod, - .mkdir = ro_mkdir, - .unlink = ro_unlink, - .rmdir = ro_rmdir, - .symlink = ro_symlink, - .rename = ro_rename, - .link = ro_link, - .truncate = ro_truncate, - .open = ro_open, - .writev = ro_writev, - .setxattr = ro_setxattr, - .fsetxattr = ro_fsetxattr, - .removexattr = ro_removexattr, - .fsyncdir = ro_fsyncdir, - .ftruncate = ro_ftruncate, - .create = ro_create, - .setattr = ro_setattr, - .fsetattr = ro_fsetattr, - .xattrop = ro_xattrop, - .fxattrop = ro_fxattrop, - .inodelk = ro_inodelk, - .finodelk = ro_finodelk, - .entrylk = ro_entrylk, - .fentrylk = ro_fentrylk, - .lk = ro_lk, + .mknod = ro_mknod, + .mkdir = ro_mkdir, + .unlink = ro_unlink, + .rmdir = ro_rmdir, + .symlink = ro_symlink, + .rename = ro_rename, + .link = ro_link, + .truncate = ro_truncate, + .open = ro_open, + .writev = ro_writev, + .setxattr = ro_setxattr, + .fsetxattr = ro_fsetxattr, + .removexattr = ro_removexattr, + .fsyncdir = ro_fsyncdir, + .ftruncate = ro_ftruncate, + .create = ro_create, + .setattr = ro_setattr, + .fsetattr = ro_fsetattr, + .xattrop = ro_xattrop, + .fxattrop = ro_fxattrop, 
+ .inodelk = ro_inodelk, + .finodelk = ro_finodelk, + .entrylk = ro_entrylk, + .fentrylk = ro_fentrylk, + .lk = ro_lk, + .fallocate = ro_fallocate, }; -struct xlator_cbks cbks = { -}; +struct xlator_cbks cbks = {}; struct volume_options options[] = { - { .key = {NULL} }, + {.key = {"read-only"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + /*.validate_fn = validate_boolean,*/ + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE, + .description = "When \"on\", makes a volume read-only. It is turned " + "\"off\" by default."}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "read-only", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/features/read-only/src/read-only.h b/xlators/features/read-only/src/read-only.h new file mode 100644 index 00000000000..aced5d3c577 --- /dev/null +++ b/xlators/features/read-only/src/read-only.h @@ -0,0 +1,37 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __READONLY_H__ +#define __READONLY_H__ + +#include <stdint.h> // for uint64_t, uint8_t +#include <sys/time.h> // for time_t +#include "glusterfs/glusterfs.h" // for gf_boolean_t + +typedef struct { + uint8_t worm : 1; + uint8_t retain : 1; + uint8_t legal_hold : 1; + uint8_t ret_mode : 1; + int64_t ret_period; + int64_t auto_commit_period; +} worm_reten_state_t; + +typedef struct { + gf_boolean_t readonly_or_worm_enabled; + gf_boolean_t worm_file; + gf_boolean_t worm_files_deletable; + int64_t reten_period; + int64_t com_period; + int reten_mode; + time_t start_time; +} read_only_priv_t; + +#endif diff --git a/xlators/features/read-only/src/worm-helper.c b/xlators/features/read-only/src/worm-helper.c new file mode 100644 index 00000000000..df45f2a940b --- /dev/null +++ b/xlators/features/read-only/src/worm-helper.c @@ -0,0 +1,395 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#include "read-only-mem-types.h" +#include "read-only.h" +#include <glusterfs/xlator.h> +#include <glusterfs/syncop.h> +#include "worm-helper.h" + +/*Function to check whether file is read-only. + * The input *stbuf contains the attributes of the file, which is used to check + * the write protection bits for all the users of the file. 
+ * Return true if all the write bits are disabled,false otherwise*/ +gf_boolean_t +gf_worm_write_disabled(struct iatt *stbuf) +{ + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO("worm", stbuf, out); + + if (stbuf->ia_prot.owner.write == 0 && stbuf->ia_prot.group.write == 0 && + stbuf->ia_prot.other.write == 0) + ret = _gf_true; +out: + return ret; +} + +int32_t +worm_init_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr) +{ + int ret = -1; + uint64_t start_time = 0; + dict_t *dict = NULL; + + GF_VALIDATE_OR_GOTO("worm", this, out); + GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); + + start_time = gf_time(); + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "Error creating the dict"); + goto out; + } + ret = dict_set_uint64(dict, "trusted.start_time", start_time); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error in setting the dict"); + goto out; + } + if (fop_with_fd) + ret = syncop_fsetxattr(this, (fd_t *)file_ptr, dict, 0, NULL, NULL); + else + ret = syncop_setxattr(this, (loc_t *)file_ptr, dict, 0, NULL, NULL); +out: + if (dict) + dict_unref(dict); + return ret; +} + +/*Function to set the retention state for a file. 
+ * It loads the WORM/Retention state into the retention_state pointer.*/ +int32_t +worm_set_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *retention_state, struct iatt *stbuf) +{ + read_only_priv_t *priv = NULL; + struct iatt stpre = { + 0, + }; + int ret = -1; + + GF_VALIDATE_OR_GOTO("worm", this, out); + GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); + GF_VALIDATE_OR_GOTO(this->name, retention_state, out); + GF_VALIDATE_OR_GOTO(this->name, stbuf, out); + + priv = this->private; + GF_ASSERT(priv); + retention_state->worm = 1; + retention_state->retain = 1; + retention_state->legal_hold = 0; + retention_state->ret_mode = priv->reten_mode; + retention_state->ret_period = priv->reten_period; + retention_state->auto_commit_period = priv->com_period; + if (fop_with_fd) + ret = syncop_fstat(this, (fd_t *)file_ptr, &stpre, NULL, NULL); + else + ret = syncop_stat(this, (loc_t *)file_ptr, &stpre, NULL, NULL); + if (ret) + goto out; + stbuf->ia_mtime = stpre.ia_mtime; + stbuf->ia_atime = gf_time() + retention_state->ret_period; + + if (fop_with_fd) + ret = syncop_fsetattr(this, (fd_t *)file_ptr, stbuf, GF_SET_ATTR_ATIME, + NULL, NULL, NULL, NULL); + else + ret = syncop_setattr(this, (loc_t *)file_ptr, stbuf, GF_SET_ATTR_ATIME, + NULL, NULL, NULL, NULL); + if (ret) + goto out; + + ret = gf_worm_set_xattr(this, retention_state, fop_with_fd, file_ptr); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error setting xattr"); + goto out; + } + ret = 0; +out: + return ret; +} + +/*This function reads the WORM/Retention state xattr of the file and loads it + * into the reten_state structure.*/ +int32_t +worm_get_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *reten_state) +{ + dict_t *dict = NULL; + char *val = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO("worm", this, out); + GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); + GF_VALIDATE_OR_GOTO(this->name, reten_state, out); + + if (fop_with_fd) + ret = 
syncop_fgetxattr(this, (fd_t *)file_ptr, &dict, + "trusted.reten_state", NULL, NULL); + else + ret = syncop_getxattr(this, (loc_t *)file_ptr, &dict, + "trusted.reten_state", NULL, NULL); + if (ret < 0 || !dict) { + ret = -1; + goto out; + } + ret = dict_get_str(dict, "trusted.reten_state", &val); + if (ret) { + ret = -2; + gf_log(this->name, GF_LOG_ERROR, "Empty val"); + } + gf_worm_deserialize_state(val, reten_state); +out: + if (dict) + dict_unref(dict); + return ret; +} + +/*Function to lookup the current state of the WORM/Retention profile. + * Based on the retain value and the access time of the file, the transition + * from WORM/Retention to WORM is made.*/ +void +gf_worm_state_lookup(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *reten_state, struct iatt *stbuf) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("worm", this, out); + GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); + GF_VALIDATE_OR_GOTO(this->name, reten_state, out); + GF_VALIDATE_OR_GOTO(this->name, stbuf, out); + + stbuf->ia_atime -= reten_state->ret_period; + reten_state->retain = 0; + reten_state->ret_period = 0; + reten_state->auto_commit_period = 0; + ret = gf_worm_set_xattr(this, reten_state, fop_with_fd, file_ptr); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error setting xattr"); + goto out; + } + + if (fop_with_fd) + ret = syncop_fsetattr(this, (fd_t *)file_ptr, stbuf, GF_SET_ATTR_ATIME, + NULL, NULL, NULL, NULL); + else + ret = syncop_setattr(this, (loc_t *)file_ptr, stbuf, GF_SET_ATTR_ATIME, + NULL, NULL, NULL, NULL); + if (ret) + goto out; + gf_log(this->name, GF_LOG_INFO, "Retention state reset"); +out: + return; +} + +/*This function serializes and stores the WORM/Retention state of a file in an + * uint64_t variable by setting the bits using the bitwise operations.*/ +void +gf_worm_serialize_state(worm_reten_state_t *reten_state, char *val) +{ + uint32_t state = 0; + + GF_VALIDATE_OR_GOTO("worm", reten_state, out); + GF_VALIDATE_OR_GOTO("worm", 
val, out); + + state |= reten_state->worm << 0; + state |= reten_state->retain << 1; + state |= reten_state->legal_hold << 2; + state |= reten_state->ret_mode << 3; + sprintf(val, "%d/%" PRIu64 "/%" PRIu64, state, reten_state->ret_period, + reten_state->auto_commit_period); + +out: + return; +} + +/*This function deserializes the data stored in the xattr of the file and loads + * the value to the reten_state structure.*/ +void +gf_worm_deserialize_state(char *val, worm_reten_state_t *reten_state) +{ + char *token = NULL; + uint32_t state = 0; + + GF_VALIDATE_OR_GOTO("worm", val, out); + GF_VALIDATE_OR_GOTO("worm", reten_state, out); + + token = strtok(val, "/"); + state = atoi(token); + reten_state->worm = (state >> 0) & 1; + reten_state->retain = (state >> 1) & 1; + reten_state->legal_hold = (state >> 2) & 1; + reten_state->ret_mode = (state >> 3) & 1; + token = strtok(NULL, "/"); + reten_state->ret_period = atoi(token); + token = strtok(NULL, "/"); + reten_state->auto_commit_period = atoi(token); + +out: + return; +} + +/*Function to set the xattr for a file. 
+ * If the xattr is already present then it will replace that.*/ +int32_t +gf_worm_set_xattr(xlator_t *this, worm_reten_state_t *reten_state, + gf_boolean_t fop_with_fd, void *file_ptr) +{ + char val[100] = ""; + int ret = -1; + dict_t *dict = NULL; + + GF_VALIDATE_OR_GOTO("worm", this, out); + GF_VALIDATE_OR_GOTO(this->name, reten_state, out); + GF_VALIDATE_OR_GOTO(this->name, file_ptr, out); + + gf_worm_serialize_state(reten_state, val); + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "Error creating the dict"); + goto out; + } + ret = dict_set_str(dict, "trusted.reten_state", val); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error in setting the dict"); + goto out; + } + if (fop_with_fd) + ret = syncop_fsetxattr(this, (fd_t *)file_ptr, dict, 0, NULL, NULL); + else + ret = syncop_setxattr(this, (loc_t *)file_ptr, dict, 0, NULL, NULL); +out: + if (dict) + dict_unref(dict); + return ret; +} + +/*This function checks whether a file's timeout has elapsed for the state + * transition and if yes, then it will do the transition from the current state + * to the appropriate state. It also decides whether to continue or to block + * the FOP. + * Return: + * 0 : If the FOP should continue i.e., if the file is not in the WORM-Retained + * state or if the FOP is unlink and the file is not in the Retained state. + * 1: If the FOP should block i.e., if the file is in WORM-Retained/WORM state. 
+ * 2: Blocks the FOP if any operation fails while doing the state transition or + * fails to get the state of the file.*/ +int +gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd, + void *file_ptr, glusterfs_fop_t op) +{ + int op_errno = EROFS; + int ret = -1; + time_t now = 0; + uint64_t com_period = 0; + uint64_t start_time = 0; + dict_t *dict = NULL; + worm_reten_state_t reten_state = { + 0, + }; + read_only_priv_t *priv = NULL; + struct iatt stbuf = { + 0, + }; + + priv = this->private; + GF_ASSERT(priv); + + if (fop_with_fd) + ret = syncop_fgetxattr(this, (fd_t *)file_ptr, &dict, + "trusted.start_time", NULL, NULL); + else + ret = syncop_getxattr(this, (loc_t *)file_ptr, &dict, + "trusted.start_time", NULL, NULL); + if (ret < 0 || !dict) { + op_errno = ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting xattr"); + goto out; + } + ret = dict_get_uint64(dict, "trusted.start_time", &start_time); + if (ret) { + op_errno = ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting start time"); + goto out; + } + + com_period = priv->com_period; + if (fop_with_fd) + ret = syncop_fstat(this, (fd_t *)file_ptr, &stbuf, NULL, NULL); + else + ret = syncop_stat(this, (loc_t *)file_ptr, &stbuf, NULL, NULL); + if (ret) { + op_errno = ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting file stat"); + goto out; + } + + ret = worm_get_state(this, fop_with_fd, file_ptr, &reten_state); + if (ret == -2) { + op_errno = ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "Error getting worm/retention state"); + goto out; + } + + now = gf_time(); + + if (ret == -1 && (now - start_time) >= com_period) { + if ((now - stbuf.ia_mtime) >= com_period) { + ret = worm_set_state(this, fop_with_fd, file_ptr, &reten_state, + &stbuf); + if (ret) { + op_errno = ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "Error setting worm/retention state"); + goto out; + } + goto out; + } else { + op_errno = 0; + goto out; + } + } else if (ret == -1 && (now - 
start_time) < com_period) { + op_errno = 0; + goto out; + } else if (reten_state.retain && ((now >= stbuf.ia_atime))) { + gf_worm_state_lookup(this, fop_with_fd, file_ptr, &reten_state, &stbuf); + } + if (reten_state.worm && !reten_state.retain && priv->worm_files_deletable && + op == GF_FOP_UNLINK) { + op_errno = 0; + goto out; + } + +out: + if (dict) + dict_unref(dict); + return op_errno; +} + +/*Function to check whether a file is independently WORMed (i.e., file level + * WORM is set on the file). */ +int32_t +is_wormfile(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr) +{ + int ret = -1; + dict_t *dict = NULL; + + if (fop_with_fd) + ret = syncop_fgetxattr(this, (fd_t *)file_ptr, &dict, + "trusted.worm_file", NULL, NULL); + else + ret = syncop_getxattr(this, (loc_t *)file_ptr, &dict, + "trusted.worm_file", NULL, NULL); + if (dict) { + ret = 0; + dict_unref(dict); + } + return ret; +} diff --git a/xlators/features/read-only/src/worm-helper.h b/xlators/features/read-only/src/worm-helper.h new file mode 100644 index 00000000000..b42f8d2b40c --- /dev/null +++ b/xlators/features/read-only/src/worm-helper.h @@ -0,0 +1,44 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +gf_boolean_t +gf_worm_write_disabled(struct iatt *stbuf); + +int32_t +worm_init_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr); + +int32_t +worm_set_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *retention_state, struct iatt *stbuf); + +int32_t +worm_get_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *reten_state); + +void +gf_worm_state_lookup(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr, + worm_reten_state_t *reten_state, struct iatt *stbuf); + +void +gf_worm_serialize_state(worm_reten_state_t *reten_state, char *val); + +void +gf_worm_deserialize_state(char *val, worm_reten_state_t *reten_state); + +int32_t +gf_worm_set_xattr(xlator_t *this, worm_reten_state_t *reten_state, + gf_boolean_t fop_with_fd, void *file_ptr); + +int +gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd, + void *file_ptr, glusterfs_fop_t op); + +int32_t +is_wormfile(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr); diff --git a/xlators/features/read-only/src/worm.c b/xlators/features/read-only/src/worm.c index 16c3eb3daed..1cc5526d5cd 100644 --- a/xlators/features/read-only/src/worm.c +++ b/xlators/features/read-only/src/worm.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2008-2012, 2016 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -7,83 +7,716 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "read-only-common.h" +#include "read-only-mem-types.h" +#include "read-only.h" +#include <glusterfs/syncop.h> +#include "worm-helper.h" + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_read_only_mt_end + 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting " + "initialization failed."); + + return ret; +} static int32_t -worm_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd, dict_t *xdata) +worm_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); + if (is_readonly_or_worm_enabled(frame, this) && + (flags & (O_WRONLY | O_RDWR | O_APPEND | O_TRUNC))) { + STACK_UNWIND_STRICT(open, frame, -1, EROFS, NULL, NULL); return 0; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; } -int32_t -worm_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +static int32_t +worm_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - if ((((flags & O_ACCMODE) == O_WRONLY) || - ((flags & O_ACCMODE) == O_RDWR)) && - !(flags & O_APPEND)) { - STACK_UNWIND_STRICT (open, frame, -1, EROFS, NULL, NULL); - return 0; + int op_errno = EROFS; + read_only_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + if (is_readonly_or_worm_enabled(frame, this)) + goto out; + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + + gf_uuid_copy(oldloc->gfid, oldloc->inode->gfid); + if (is_wormfile(this, _gf_false, oldloc)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, 
_gf_false, oldloc, GF_FOP_LINK); + +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + +static int32_t +worm_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) +{ + int op_errno = EROFS; + read_only_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + if (is_readonly_or_worm_enabled(frame, this)) { + goto out; + } + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + + gf_uuid_copy(loc->gfid, loc->inode->gfid); + if (is_wormfile(this, _gf_false, loc)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, _gf_false, loc, GF_FOP_UNLINK); +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, flags, xdata); + return 0; +} + +static int32_t +worm_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int op_errno = EROFS; + read_only_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + if (is_readonly_or_worm_enabled(frame, this)) + goto out; + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + + gf_uuid_copy(oldloc->gfid, oldloc->inode->gfid); + if (is_wormfile(this, _gf_false, oldloc)) { + op_errno = 0; + goto check_newloc; + } + op_errno = gf_worm_state_transition(this, _gf_false, oldloc, GF_FOP_RENAME); + + if (op_errno == 0) { + check_newloc: + if (newloc->inode != NULL) { + gf_uuid_copy(newloc->gfid, newloc->inode->gfid); + if (is_wormfile(this, _gf_false, newloc)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, _gf_false, newloc, + GF_FOP_RENAME); } + } 
- STACK_WIND (frame, worm_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; } -int32_t -init (xlator_t *this) +static int32_t +worm_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator not configured with exactly one child"); - return -1; + int op_errno = EROFS; + read_only_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + if (is_readonly_or_worm_enabled(frame, this)) + goto out; + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + + if (is_wormfile(this, _gf_false, loc)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, _gf_false, loc, GF_FOP_TRUNCATE); + +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +static int32_t +worm_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int op_errno = EROFS; + read_only_priv_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + if (is_readonly_or_worm_enabled(frame, this)) + goto out; + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + + if (is_wormfile(this, _gf_true, fd)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, _gf_true, fd, GF_FOP_FTRUNCATE); + +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, 
NULL, NULL, NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +static int32_t +worm_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + gf_boolean_t rd_only = _gf_false; + worm_reten_state_t reten_state = { + 0, + }; + struct iatt stpre = { + 0, + }; + read_only_priv_t *priv = NULL; + int op_errno = EROFS; + int ret = -1; + + priv = this->private; + GF_ASSERT(priv); + if (!priv->worm_file) { + op_errno = 0; + goto out; + } + + if (is_wormfile(this, _gf_false, loc)) { + op_errno = 0; + goto out; + } + if (valid & GF_SET_ATTR_MODE) { + rd_only = gf_worm_write_disabled(stbuf); + if (!rd_only) { + op_errno = 0; + goto out; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); + ret = worm_set_state(this, _gf_false, loc, &reten_state, stbuf); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error setting worm state"); + goto out; + } + } else if (valid & GF_SET_ATTR_ATIME) { + ret = worm_get_state(this, _gf_false, loc, &reten_state); + if (ret) { + op_errno = 0; + goto out; } + if (reten_state.retain) { + ret = syncop_stat(this, loc, &stpre, NULL, NULL); + if (ret) + goto out; + if (reten_state.ret_mode == 0) { + if (stbuf->ia_atime < stpre.ia_mtime) { + gf_log(this->name, GF_LOG_ERROR, + "Cannot set atime less than " + "the mtime for a WORM-Retained " + "file"); + goto out; + } + } else { + if (stbuf->ia_atime < stpre.ia_atime) { + gf_log(this->name, GF_LOG_ERROR, + "Cannot decrease the atime of a" + " WORM-Retained file in " + "Enterprise mode"); + goto out; + } + } + reten_state.ret_period = reten_state.ret_period + stbuf->ia_atime - + stpre.ia_atime; + ret = gf_worm_set_xattr(this, &reten_state, _gf_false, loc); + if (ret) { + goto out; + } + stbuf->ia_mtime = stpre.ia_mtime; + } + } + op_errno = 0; - return 0; +out: + if (op_errno) + STACK_UNWIND_STRICT(setattr, frame, -1, 
EROFS, NULL, NULL, NULL); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + xdata); + return 0; +} + +static int32_t +worm_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + gf_boolean_t rd_only = _gf_false; + worm_reten_state_t reten_state = { + 0, + }; + struct iatt stpre = { + 0, + }; + read_only_priv_t *priv = NULL; + int op_errno = EROFS; + int ret = -1; + + priv = this->private; + GF_ASSERT(priv); + if (!priv->worm_file) { + op_errno = 0; + goto out; + } + + if (is_wormfile(this, _gf_true, fd)) { + op_errno = 0; + goto out; + } + if (valid & GF_SET_ATTR_MODE) { + rd_only = gf_worm_write_disabled(stbuf); + if (!rd_only) { + op_errno = 0; + goto out; + } + + ret = worm_set_state(this, _gf_true, fd, &reten_state, stbuf); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error setting worm state"); + goto out; + } + } else if (valid & GF_SET_ATTR_ATIME) { + ret = worm_get_state(this, _gf_true, fd, &reten_state); + if (ret) { + op_errno = 0; + goto out; + } + if (reten_state.retain) { + ret = syncop_fstat(this, fd, &stpre, NULL, NULL); + if (ret) + goto out; + if (reten_state.ret_mode == 0) { + if (stbuf->ia_atime < stpre.ia_mtime) { + gf_log(this->name, GF_LOG_ERROR, + "Cannot set atime less than " + "the mtime for a WORM-Retained " + "file"); + goto out; + } + } else { + if (stbuf->ia_atime < stpre.ia_atime) { + gf_log(this->name, GF_LOG_ERROR, + "Cannot decrease the atime of a" + " WORM-Retained file in " + "Enterprise mode"); + goto out; + } + } + reten_state.ret_period = reten_state.ret_period + stbuf->ia_atime - + stpre.ia_atime; + ret = gf_worm_set_xattr(this, &reten_state, _gf_true, fd); + if (ret) { + goto out; + } + + stbuf->ia_mtime = stpre.ia_mtime; + } + } + op_errno = 0; + +out: + if (op_errno) + STACK_UNWIND_STRICT(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, + xdata); + return 0; +} + +static int32_t +worm_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + read_only_priv_t *priv = NULL; + int op_errno = EROFS; + + priv = this->private; + GF_ASSERT(priv); + if (!priv->worm_file || (frame->root->pid < 0)) { + op_errno = 0; + goto out; + } + if (is_wormfile(this, _gf_true, fd)) { + op_errno = 0; + goto out; + } + op_errno = gf_worm_state_transition(this, _gf_true, fd, GF_FOP_WRITE); + +out: + if (op_errno) { + if (op_errno < 0) + op_errno = EROFS; + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); + } else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; +} + +static int32_t +worm_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = 0; + read_only_priv_t *priv = NULL; + // In case of an error exit because fd can be NULL and this would + // cause an segfault when performing fsetxattr . We explicitly + // unwind to avoid future problems + if (op_ret < 0) { + goto out; + } + + priv = this->private; + GF_ASSERT(priv); + if (priv->worm_file) { + ret = fd_ctx_set(fd, this, 1); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to set the fd ctx " + "for gfid:%s . 
Worm feature may not work for the gfid", + uuid_utoa(inode->gfid)); + } + ret = worm_init_state(this, _gf_true, fd); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error initializing state"); + } + } + +out: + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return ret; +} + +static int32_t +worm_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, worm_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} + +static void +set_reten_mode(read_only_priv_t *priv, char *reten_mode) +{ + if (strcmp(reten_mode, "relax") == 0) + priv->reten_mode = 0; + else + priv->reten_mode = 1; +} + +int32_t +init(xlator_t *this) +{ + int ret = -1; + read_only_priv_t *priv = NULL; + char *reten_mode = NULL; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "translator not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. 
check volfile "); + } + + this->local_pool = mem_pool_new(read_only_priv_t, 64); + if (!this->local_pool) { + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "failed to create read_only_priv_t's memory pool"); + goto out; + } + + priv = mem_get0(this->local_pool); + if (!priv) { + gf_log(this->name, GF_LOG_ERROR, "Error allocating priv"); + goto out; + } + + this->private = priv; + + GF_OPTION_INIT("worm", priv->readonly_or_worm_enabled, bool, out); + GF_OPTION_INIT("worm-file-level", priv->worm_file, bool, out); + GF_OPTION_INIT("default-retention-period", priv->reten_period, int64, out); + GF_OPTION_INIT("auto-commit-period", priv->com_period, int64, out); + GF_OPTION_INIT("retention-mode", reten_mode, str, out); + set_reten_mode(priv, reten_mode); + GF_OPTION_INIT("worm-files-deletable", priv->worm_files_deletable, bool, + out); + + ret = 0; +out: + return ret; } +int +reconfigure(xlator_t *this, dict_t *options) +{ + read_only_priv_t *priv = NULL; + char *reten_mode = NULL; + int ret = -1; + + priv = this->private; + GF_ASSERT(priv); + + GF_OPTION_RECONF("worm", priv->readonly_or_worm_enabled, options, bool, + out); + GF_OPTION_RECONF("worm-file-level", priv->worm_file, options, bool, out); + GF_OPTION_RECONF("default-retention-period", priv->reten_period, options, + int64, out); + GF_OPTION_RECONF("retention-mode", reten_mode, options, str, out); + set_reten_mode(priv, reten_mode); + GF_OPTION_RECONF("auto-commit-period", priv->com_period, options, int64, + out); + GF_OPTION_RECONF("worm-files-deletable", priv->worm_files_deletable, + options, bool, out); + ret = 0; +out: + gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret); + return ret; +} void -fini (xlator_t *this) +fini(xlator_t *this) { - return; + read_only_priv_t *priv = NULL; + + priv = this->private; + if (!priv) + goto out; + mem_put(priv); + this->private = NULL; + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; +out: + return; } struct xlator_fops fops = { - .open = worm_open, - - 
.unlink = ro_unlink, - .rmdir = ro_rmdir, - .rename = ro_rename, - .truncate = ro_truncate, - .removexattr = ro_removexattr, - .fsyncdir = ro_fsyncdir, - .xattrop = ro_xattrop, - .inodelk = ro_inodelk, - .finodelk = ro_finodelk, - .entrylk = ro_entrylk, - .fentrylk = ro_fentrylk, - .lk = ro_lk, + .open = worm_open, + .writev = worm_writev, + .setattr = worm_setattr, + .fsetattr = worm_fsetattr, + .rename = worm_rename, + .link = worm_link, + .unlink = worm_unlink, + .truncate = worm_truncate, + .ftruncate = worm_ftruncate, + .create = worm_create, + + .rmdir = ro_rmdir, + .removexattr = ro_removexattr, + .fsyncdir = ro_fsyncdir, + .xattrop = ro_xattrop, + .inodelk = ro_inodelk, + .finodelk = ro_finodelk, + .entrylk = ro_entrylk, + .fentrylk = ro_fentrylk, + .lk = ro_lk, }; -struct xlator_cbks cbks; +int32_t +worm_release(xlator_t *this, fd_t *fd) +{ + dict_t *dict = NULL; + int ret = -1; + dict = dict_new(); + uint64_t value = 0; + loc_t loc = { + 0, + }; + read_only_priv_t *priv = NULL; + priv = this->private; + + if (priv->worm_file) { + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "Error creating the dict"); + goto out; + } + + ret = fd_ctx_get(fd, this, &value); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "Failed to get the fd ctx"); + } + if (!value) { + goto out; + } + + ret = dict_set_int8(dict, "trusted.worm_file", 1); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Error in setting " + "the dict"); + goto out; + } + + loc.inode = inode_ref(fd->inode); + gf_uuid_copy(loc.gfid, fd->inode->gfid); + ret = syncop_setxattr(this, &loc, dict, 0, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Error setting xattr"); + goto out; + } + + gf_worm_state_transition(this, _gf_false, &loc, GF_FOP_WRITE); + } + +out: + loc_wipe(&loc); + if (dict) + dict_unref(dict); + return 0; +} + +struct xlator_cbks cbks = { + .release = worm_release, +}; struct volume_options options[] = { - { .key = {NULL} }, + {.key = {"worm"}, + .type = GF_OPTION_TYPE_BOOL, + 
.default_value = "off", + /*.validate_fn = validate_boolean,*/ + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE, + .description = "When \"on\", makes a volume get write once read many " + " feature. It is turned \"off\" by default."}, + {.key = {"worm-file-level"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + /*.validate_fn = validate_boolean,*/ + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "When \"on\", activates the file level worm. " + "It is turned \"off\" by default."}, + {.key = {"worm-files-deletable"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + /*.validate_fn = validate_boolean,*/ + .op_version = {GD_OP_VERSION_3_13_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "When \"off\", doesn't allow the Worm files" + "to be deleted. It is turned \"on\" by default."}, + {.key = {"default-retention-period"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "120", + /*.validate_fn = validate_worm_period,*/ + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "The default retention period for the files."}, + {.key = {"retention-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "relax", + /*.validate_fn = validate_reten_mode,*/ + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "The mode of retention (relax/enterprise). 
" + "It is relax by default."}, + {.key = {"auto-commit-period"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "180", + /*.validate_fn = validate_worm_period,*/ + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Auto commit period for the files."}, + {.key = {NULL}}, }; +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "worm", + .category = GF_TECH_PREVIEW, +}; diff --git a/xlators/features/sdfs/Makefile.am b/xlators/features/sdfs/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/sdfs/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/sdfs/src/Makefile.am b/xlators/features/sdfs/src/Makefile.am new file mode 100644 index 00000000000..6118d46ad22 --- /dev/null +++ b/xlators/features/sdfs/src/Makefile.am @@ -0,0 +1,19 @@ +if WITH_SERVER +xlator_LTLIBRARIES = sdfs.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +sdfs_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +sdfs_la_SOURCES = sdfs.c +sdfs_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = sdfs.h sdfs-messages.h $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ + +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/sdfs/src/sdfs-messages.h b/xlators/features/sdfs/src/sdfs-messages.h new file mode 100644 index 00000000000..3053efa8935 --- /dev/null +++ b/xlators/features/sdfs/src/sdfs-messages.h @@ -0,0 +1,67 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _DFS_MESSAGES_H_ +#define _DFS_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* file sdfs-messages.h + * brief SDFS log-message IDs and their descriptions + */ + +/* NOTE: Rules for message additions + * 1) Each instance of a message is _better_ left with a unique message ID, even + * if the message format is the same. Reasoning is that, if the message + * format needs to change in one instance, the other instances are not + * impacted or the new change does not change the ID of the instance being + * modified. + * 2) Addition of a message, + * - Should increment the GLFS_NUM_MESSAGES + * - Append to the list of messages defined, towards the end + * - Retain macro naming as glfs_msg_X (for readability across developers) + * NOTE: Rules for message format modifications + * 3) Check across the code if the message ID macro in question is reused + * anywhere. If reused then the modifications should ensure correctness + * everywhere, or needs a new message ID as (1) above was not adhered to. If + * not used anywhere, proceed with the required modification. + * NOTE: Rules for message deletion + * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used + * anywhere, then can be deleted, but will leave a hole by design, as + * addition rules specify modification to the end of the list and not filling + * holes. + */ + +#define GLFS_SDFS_BASE GLFS_MSGID_COMP_SDFS +#define GLFS_SDFS_NUM_MESSAGES 2 +#define GLFS_MSGID_END (GLFS_SDFS_BASE + GLFS_SDFS_NUM_MESSAGES + 1) +/* Messages with message IDs */ +#define glfs_msg_start_x GLFS_DFS_BASE, "Invalid: Start of messages" +/*------------*/ + +#define SDFS_MSG_ENTRYLK_ERROR (GLFS_SDFS_BASE + 1) +/*! 
+ * @messageid + * @diagnosis + * @recommendedaction + * + */ + +#define SDFS_MSG_MKDIR_ERROR (GLFS_SDFS_BASE + 2) +/*! + * @messageid + * @diagnosis + * @recommendedaction + * + */ +/*------------*/ + +#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" +#endif /* !_SDFS_MESSAGES_H_ */ diff --git a/xlators/features/sdfs/src/sdfs.c b/xlators/features/sdfs/src/sdfs.c new file mode 100644 index 00000000000..aaf13f0852e --- /dev/null +++ b/xlators/features/sdfs/src/sdfs.c @@ -0,0 +1,1479 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#include <libgen.h> +#include "sdfs.h" + +static int +sdfs_frame_return(call_frame_t *frame) +{ + sdfs_local_t *local = NULL; + + if (!frame) + return -1; + + local = frame->local; + + return GF_ATOMIC_DEC(local->call_cnt); +} + +static void +sdfs_lock_free(sdfs_entry_lock_t *entrylk) +{ + if (entrylk == NULL) + goto out; + + loc_wipe(&entrylk->parent_loc); + GF_FREE(entrylk->basename); + +out: + return; +} + +static void +sdfs_lock_array_free(sdfs_lock_t *lock) +{ + sdfs_entry_lock_t *entrylk = NULL; + int i = 0; + + if (lock == NULL) + goto out; + + for (i = 0; i < lock->lock_count; i++) { + entrylk = &lock->entrylk[i]; + sdfs_lock_free(entrylk); + } + +out: + return; +} + +static void +sdfs_local_cleanup(sdfs_local_t *local) +{ + if (!local) + return; + + loc_wipe(&local->loc); + loc_wipe(&local->parent_loc); + + if (local->stub) { + call_stub_destroy(local->stub); + local->stub = NULL; + } + + sdfs_lock_array_free(local->lock); + GF_FREE(local->lock); + + mem_put(local); +} + +static int +sdfs_build_parent_loc(loc_t *parent, loc_t *child) +{ + int ret = -1; + char *path = NULL; + + if (!child->parent) { + 
goto out; + } + parent->inode = inode_ref(child->parent); + path = gf_strdup(child->path); + if (!path) { + ret = -ENOMEM; + goto out; + } + + parent->path = dirname(path); + if (!parent->path) { + goto out; + } + + gf_uuid_copy(parent->gfid, child->pargfid); + return 0; + +out: + GF_FREE(path); + return ret; +} + +static sdfs_local_t * +sdfs_local_init(call_frame_t *frame, xlator_t *this) +{ + sdfs_local_t *local = NULL; + + local = mem_get0(this->local_pool); + if (!local) + goto out; + + frame->local = local; +out: + return local; +} + +static int +sdfs_get_new_frame_common(call_frame_t *frame, call_frame_t **new_frame) +{ + int ret = -1; + sdfs_local_t *local = NULL; + client_t *client = NULL; + + *new_frame = copy_frame(frame); + if (!*new_frame) { + goto err; + } + + client = frame->root->client; + gf_client_ref(client); + (*new_frame)->root->client = client; + + local = sdfs_local_init(*new_frame, THIS); + if (!local) { + goto err; + } + + local->main_frame = frame; + /*Set unique lk-owner for the fop*/ + set_lk_owner_from_ptr(&(*new_frame)->root->lk_owner, (*new_frame)->root); + + ret = 0; +err: + if ((ret == -1) && (*new_frame)) { + SDFS_STACK_DESTROY((*new_frame)); + *new_frame = NULL; + } + + return ret; +} + +static int +sdfs_get_new_frame(call_frame_t *frame, loc_t *loc, call_frame_t **new_frame) +{ + int ret = -1; + sdfs_local_t *local = NULL; + + ret = sdfs_get_new_frame_common(frame, new_frame); + if (ret < 0) { + goto err; + } + + local = (*new_frame)->local; + + ret = sdfs_build_parent_loc(&local->parent_loc, loc); + if (ret) { + goto err; + } + + ret = loc_copy(&local->loc, loc); + if (ret == -1) { + goto err; + } + + ret = 0; +err: + if (ret && (*new_frame)) { + SDFS_STACK_DESTROY((*new_frame)); + *new_frame = NULL; + ret = -1; + } + + return ret; +} + +static int +sdfs_get_new_frame_readdirp(call_frame_t *frame, fd_t *fd, + call_frame_t **new_frame) +{ + int ret = -1; + sdfs_local_t *local = NULL; + + ret = sdfs_get_new_frame_common(frame, 
new_frame); + if (ret < 0) { + goto err; + } + + local = (*new_frame)->local; + local->parent_loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->parent_loc.gfid, fd->inode->gfid); + + ret = 0; +err: + return ret; +} + +int +sdfs_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_stub_t *stub = NULL; + + local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (local->stub) { + stub = local->stub; + local->stub = NULL; + call_resume(stub); + } else { + if (op_ret < 0) + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Unlocking entry lock failed for %s", local->loc.name); + + SDFS_STACK_DESTROY(frame); + } + + return 0; +} + +int +sdfs_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(mkdir, local->main_frame, op_ret, op_errno, inode, + stbuf, preparent, postparent, xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int op_errno = -1; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + op_errno = local->op_errno; + goto err; + } + + STACK_WIND(frame, sdfs_mkdir_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(mkdir, local->main_frame, -1, op_errno, NULL, NULL, + NULL, NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_mkdir_stub(new_frame, sdfs_mkdir_helper, loc, mode, umask, + xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(rmdir, local->main_frame, op_ret, op_errno, preparent, + postparent, xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_rmdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 
SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(rmdir, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_rmdir_stub(new_frame, sdfs_rmdir_helper, loc, flags, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(rmdir, frame, -1, op_errno, NULL, NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(create, local->main_frame, op_ret, op_errno, fd, inode, + stbuf, preparent, postparent, xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_create_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, 
mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + + return 0; +err: + STACK_UNWIND_STRICT(create, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL, NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_create_stub(new_frame, sdfs_create_helper, loc, flags, mode, + umask, fd, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(unlink, local->main_frame, op_ret, op_errno, preparent, + postparent, xdata); + + 
local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_unlink_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, flags, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(unlink, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_unlink_stub(new_frame, sdfs_unlink_helper, loc, flags, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt 
*postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(link, local->main_frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_symlink_helper(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(link, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_symlink_stub(new_frame, sdfs_symlink_helper, linkname, loc, + umask, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(link, frame, 
-1, op_errno, NULL, NULL, NULL, NULL, + NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_common_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + int this_call_cnt = 0; + int lk_index = 0; + sdfs_lock_t *locks = NULL; + call_stub_t *stub = NULL; + + local = frame->local; + locks = local->lock; + lk_index = (long)cookie; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + locks->entrylk->locked[lk_index] = _gf_true; + } + + this_call_cnt = sdfs_frame_return(frame); + if (this_call_cnt > 0) { + gf_log(this->name, GF_LOG_DEBUG, + "As there are more callcnt (%d) returning without WIND", + this_call_cnt); + return 0; + } + + if (local->stub) { + stub = local->stub; + local->stub = NULL; + call_resume(stub); + } else { + if (local->op_ret < 0) + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "unlocking entry lock failed "); + SDFS_STACK_DESTROY(frame); + } + + return 0; +} + +int +sdfs_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + sdfs_lock_t *lock = NULL; + int i = 0; + int lock_count = 0; + + local = frame->local; + lock = local->lock; + + STACK_UNWIND_STRICT(link, local->main_frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + + local->main_frame = NULL; + lock_count = lock->lock_count; + for (i = 0; i < lock_count; i++) { + STACK_WIND_COOKIE(frame, sdfs_common_entrylk_cbk, (void *)(long)i, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, + this->name, &lock->entrylk[i].parent_loc, + lock->entrylk[i].basename, ENTRYLK_UNLOCK, + ENTRYLK_WRLCK, xdata); + } + + return 0; +} + +int +sdfs_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, 
dict_t *xdata) +{ + sdfs_local_t *local = NULL; + sdfs_lock_t *locks = NULL; + gf_boolean_t stack_destroy = _gf_true; + int lock_count = 0; + int i = 0; + + local = frame->local; + locks = local->lock; + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed"); + goto err; + } + + STACK_WIND(frame, sdfs_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(link, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL, NULL); + + local->main_frame = NULL; + for (i = 0; i < locks->lock_count && locks->entrylk->locked[i]; i++) { + lock_count++; + } + GF_ATOMIC_INIT(local->call_cnt, lock_count); + + for (i = 0; i < lock_count; i++) { + if (!locks->entrylk->locked[i]) { + lock_count++; + continue; + } + + stack_destroy = _gf_false; + STACK_WIND(frame, sdfs_common_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, + &locks->entrylk[i].parent_loc, locks->entrylk[i].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + } + + if (stack_destroy) + SDFS_STACK_DESTROY(frame); + + return 0; +} + +static int +sdfs_init_entry_lock(sdfs_entry_lock_t *lock, loc_t *loc) +{ + int ret = 0; + + ret = sdfs_build_parent_loc(&lock->parent_loc, loc); + if (ret) + return -1; + + lock->basename = gf_strdup(loc->name); + if (!lock->basename) + return -1; + + return 0; +} + +int +sdfs_entry_lock_cmp(const void *l1, const void *l2) +{ + const sdfs_entry_lock_t *r1 = l1; + const sdfs_entry_lock_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid((loc_t *)&r1->parent_loc, gfid1); + loc_gfid((loc_t *)&r2->parent_loc, gfid2); + ret = gf_uuid_compare(gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp(r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 
1; +} + +int +sdfs_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + sdfs_lock_t *lock = NULL; + client_t *client = NULL; + int ret = 0; + int op_errno = ENOMEM; + + new_frame = copy_frame(frame); + if (!new_frame) { + op_errno = ENOMEM; + goto err; + } + /*Set unique lk-owner for the fop*/ + set_lk_owner_from_ptr(&new_frame->root->lk_owner, new_frame->root); + + gf_client_ref(client); + new_frame->root->client = client; + local = sdfs_local_init(new_frame, this); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->main_frame = frame; + + lock = GF_CALLOC(1, sizeof(*lock), gf_common_mt_char); + if (!lock) + goto err; + + local->lock = lock; + + ret = sdfs_init_entry_lock(&lock->entrylk[0], newloc); + if (ret) + goto err; + + ++lock->lock_count; + + local->lock = lock; + GF_ATOMIC_INIT(local->call_cnt, lock->lock_count); + + ret = loc_copy(&local->loc, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_link_stub(new_frame, sdfs_link_helper, oldloc, newloc, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local->stub = stub; + + STACK_WIND_COOKIE(new_frame, sdfs_common_entrylk_cbk, 0, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, + &lock->entrylk[0].parent_loc, lock->entrylk[0].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + + STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND_STRICT(mknod, local->main_frame, op_ret, op_errno, inode, + stbuf, preparent, 
postparent, xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + return 0; +} + +int +sdfs_mknod_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(mknod, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL, NULL); + + local->main_frame = NULL; + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_mknod_stub(new_frame, sdfs_mknod_helper, loc, mode, rdev, umask, + xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_rename_cbk(call_frame_t *frame, void *cookie, xlator_t 
*this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + sdfs_lock_t *lock = NULL; + int i = 0; + int call_cnt = 0; + + local = frame->local; + lock = local->lock; + GF_ATOMIC_INIT(local->call_cnt, lock->lock_count); + + STACK_UNWIND_STRICT(rename, local->main_frame, op_ret, op_errno, stbuf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); + + local->main_frame = NULL; + call_cnt = GF_ATOMIC_GET(local->call_cnt); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, sdfs_common_entrylk_cbk, (void *)(long)i, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, + this->name, &lock->entrylk[i].parent_loc, + lock->entrylk[i].basename, ENTRYLK_UNLOCK, + ENTRYLK_WRLCK, xdata); + } + + return 0; +} + +int +sdfs_rename_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + sdfs_lock_t *lock = NULL; + gf_boolean_t stack_destroy = _gf_true; + int lock_count = 0; + int i = 0; + + local = frame->local; + lock = local->lock; + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed "); + goto err; + } + + STACK_WIND(frame, sdfs_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + + return 0; + +err: + STACK_UNWIND_STRICT(rename, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL, NULL, NULL); + + local->main_frame = NULL; + for (i = 0; i < lock->lock_count && lock->entrylk->locked[i]; i++) { + lock_count++; + } + GF_ATOMIC_INIT(local->call_cnt, lock_count); + + for (i = 0; i < lock_count; i++) { + if (!lock->entrylk->locked[i]) { + lock_count++; + continue; + } + stack_destroy = _gf_false; + STACK_WIND(frame, sdfs_common_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, 
this->name, + &lock->entrylk[i].parent_loc, lock->entrylk[i].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); + } + + if (stack_destroy) + SDFS_STACK_DESTROY(frame); + + return 0; +} + +int +sdfs_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + sdfs_lock_t *lock = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + client_t *client = NULL; + int ret = 0; + int op_errno = ENOMEM; + int i = 0; + int call_cnt = 0; + + new_frame = copy_frame(frame); + if (!new_frame) { + op_errno = ENOMEM; + goto err; + } + /*Set unique lk-owner for the fop*/ + set_lk_owner_from_ptr(&new_frame->root->lk_owner, new_frame->root); + + gf_client_ref(client); + new_frame->root->client = client; + local = sdfs_local_init(new_frame, this); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->main_frame = frame; + + lock = GF_CALLOC(1, sizeof(*lock), gf_common_mt_char); + if (!lock) + goto err; + + local->lock = lock; + + ret = sdfs_init_entry_lock(&lock->entrylk[0], oldloc); + if (ret) + goto err; + lock->entrylk->locked[0] = _gf_false; + + ++lock->lock_count; + + ret = sdfs_init_entry_lock(&lock->entrylk[1], newloc); + if (ret) + goto err; + lock->entrylk->locked[1] = _gf_false; + + ++lock->lock_count; + + qsort(lock->entrylk, lock->lock_count, sizeof(*lock->entrylk), + sdfs_entry_lock_cmp); + + local->lock = lock; + GF_ATOMIC_INIT(local->call_cnt, lock->lock_count); + + stub = fop_rename_stub(new_frame, sdfs_rename_helper, oldloc, newloc, + xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local->stub = stub; + call_cnt = GF_ATOMIC_GET(local->call_cnt); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(new_frame, sdfs_common_entrylk_cbk, (void *)(long)i, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, + this->name, &lock->entrylk[i].parent_loc, + lock->entrylk[i].basename, ENTRYLK_LOCK, + ENTRYLK_WRLCK, xdata); + } + + return 0; +err: + + 
STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +sdfs_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + + if (!local->loc.parent) { + sdfs_local_cleanup(local); + frame->local = NULL; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, + xdata, postparent); + return 0; + } + + STACK_UNWIND_STRICT(lookup, local->main_frame, op_ret, op_errno, inode, + stbuf, xdata, postparent); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_RDLCK, xdata); + return 0; +} + +int +sdfs_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(loc->pargfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(lookup, local->main_frame, -1, local->op_errno, NULL, + NULL, NULL, NULL); + local->main_frame = NULL; + + SDFS_STACK_DESTROY(frame); + return 0; +} + +int +sdfs_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (!loc->parent) { + local = sdfs_local_init(frame, this); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + 
STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; + } + + if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_lookup_stub(new_frame, sdfs_lookup_helper, loc, xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + local->loc.name, ENTRYLK_LOCK, ENTRYLK_RDLCK, xdata); + + return 0; + +err: + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int32_t +sdfs_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + sdfs_local_t *local = NULL; + + local = frame->local; + STACK_UNWIND_STRICT(readdirp, local->main_frame, op_ret, op_errno, entries, + xdata); + + local->main_frame = NULL; + STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + NULL, ENTRYLK_UNLOCK, ENTRYLK_RDLCK, xdata); + return 0; +} + +int32_t +sdfs_readdirp_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + gf_uuid_unparse(fd->inode->gfid, gfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR, + "Acquiring entry lock failed for directory %s " + "with parent gfid %s", + local->loc.name, gfid); + goto err; + } + + STACK_WIND(frame, sdfs_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(readdirp, local->main_frame, -1, local->op_errno, NULL, + NULL); + + local->main_frame = NULL; + + 
SDFS_STACK_DESTROY(frame); + return 0; +} + +int32_t +sdfs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + sdfs_local_t *local = NULL; + call_frame_t *new_frame = NULL; + call_stub_t *stub = NULL; + int op_errno = 0; + + if (-1 == sdfs_get_new_frame_readdirp(frame, fd, &new_frame)) { + op_errno = ENOMEM; + goto err; + } + + stub = fop_readdirp_stub(new_frame, sdfs_readdirp_helper, fd, size, off, + xdata); + if (!stub) { + op_errno = ENOMEM; + goto err; + } + + local = new_frame->local; + local->stub = stub; + + STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc, + NULL, ENTRYLK_LOCK, ENTRYLK_RDLCK, xdata); + + return 0; + +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL); + + if (new_frame) + SDFS_STACK_DESTROY(new_frame); + + return 0; +} + +int +init(xlator_t *this) +{ + int ret = -1; + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "'dentry-fop-serializer' not configured with exactly one child"); + goto out; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. 
check volfile "); + } + + this->local_pool = mem_pool_new(sdfs_local_t, 512); + if (!this->local_pool) { + goto out; + } + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + ret = 0; + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = -1; + + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + + ret = 0; +out: + return ret; +} + +void +fini(xlator_t *this) +{ + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + return; +} + +struct xlator_fops fops = { + .mkdir = sdfs_mkdir, + .rmdir = sdfs_rmdir, + .create = sdfs_create, + .unlink = sdfs_unlink, + .symlink = sdfs_symlink, + .link = sdfs_link, + .mknod = sdfs_mknod, + .rename = sdfs_rename, + .lookup = sdfs_lookup, + .readdirp = sdfs_readdirp, +}; + +struct xlator_cbks cbks; + +struct volume_options options[] = { + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"sdfs"}, + .description = "Enable/Disable dentry serialize functionality"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .op_version = {GD_OP_VERSION_4_0_0}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "sdfs", + .category = GF_TECH_PREVIEW, +}; diff --git a/xlators/features/sdfs/src/sdfs.h b/xlators/features/sdfs/src/sdfs.h new file mode 100644 index 00000000000..dded5a2d7fc --- /dev/null +++ b/xlators/features/sdfs/src/sdfs.h @@ -0,0 +1,49 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> +#include "sdfs-messages.h" +#include <glusterfs/atomic.h> + +#define SDFS_LOCK_COUNT_MAX 2 + +typedef struct { + loc_t parent_loc; + char *basename; + int locked[SDFS_LOCK_COUNT_MAX]; +} sdfs_entry_lock_t; + +typedef struct { + sdfs_entry_lock_t entrylk[SDFS_LOCK_COUNT_MAX]; + int lock_count; +} sdfs_lock_t; + +struct sdfs_local { + call_frame_t *main_frame; + loc_t loc; + loc_t parent_loc; + call_stub_t *stub; + sdfs_lock_t *lock; + int op_ret; + int op_errno; + gf_atomic_t call_cnt; +}; +typedef struct sdfs_local sdfs_local_t; + +#define SDFS_STACK_DESTROY(frame) \ + do { \ + sdfs_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + gf_client_unref(frame->root->client); \ + STACK_DESTROY(frame->root); \ + sdfs_local_cleanup(__local); \ + } while (0) diff --git a/xlators/features/selinux/Makefile.am b/xlators/features/selinux/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/selinux/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/selinux/src/Makefile.am b/xlators/features/selinux/src/Makefile.am new file mode 100644 index 00000000000..4f1e5e149b3 --- /dev/null +++ b/xlators/features/selinux/src/Makefile.am @@ -0,0 +1,20 @@ +if WITH_SERVER +xlator_LTLIBRARIES = selinux.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +selinux_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +selinux_la_SOURCES = selinux.c + +selinux_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = selinux.h selinux-messages.h selinux-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/features/path-convertor/src/path-mem-types.h b/xlators/features/selinux/src/selinux-mem-types.h index 
77ada8d537a..553e59e5a9d 100644 --- a/xlators/features/path-convertor/src/path-mem-types.h +++ b/xlators/features/selinux/src/selinux-mem-types.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -7,16 +7,13 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef __PATH_MEM_TYPES_H__ -#define __PATH_MEM_TYPES_H__ +#ifndef __SELINUX_MEM_TYPES_H__ +#define __SELINUX_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> -enum gf_path_mem_types_ { - gf_path_mt_path_private_t = gf_common_mt_end + 1, - gf_path_mt_char, - gf_path_mt_regex_t, - gf_path_mt_end +enum gf_selinux_mem_types_ { + gf_selinux_mt_selinux_priv_t = gf_common_mt_end + 1, + gf_selinux_mt_end }; #endif - diff --git a/xlators/features/selinux/src/selinux-messages.h b/xlators/features/selinux/src/selinux-messages.h new file mode 100644 index 00000000000..f49a54f956c --- /dev/null +++ b/xlators/features/selinux/src/selinux-messages.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _SELINUX_MESSAGES_H__ +#define _SELINUX_MESSAGES_H__ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. 
+ * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(SL, SL_MSG_INVALID_VOLFILE, SL_MSG_ENOMEM, + SL_MSG_MEM_ACCT_INIT_FAILED, SL_MSG_SELINUX_GLUSTER_XATTR_MISSING, + SL_MSG_SELINUX_XATTR_MISSING); + +#endif /*_SELINUX_MESSAGES_H */ diff --git a/xlators/features/selinux/src/selinux.c b/xlators/features/selinux/src/selinux.c new file mode 100644 index 00000000000..9b1b4b55e1a --- /dev/null +++ b/xlators/features/selinux/src/selinux.c @@ -0,0 +1,323 @@ +/* + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/xlator.h> + +#include "selinux.h" +#include "selinux-messages.h" +#include "selinux-mem-types.h" +#include <glusterfs/compat-errno.h> + +static int +selinux_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) +{ + int ret = 0; + char *name = cookie; + + if (op_errno == 0 && dict && name && + (!strcmp(name, SELINUX_GLUSTER_XATTR))) { + ret = dict_rename_key(dict, SELINUX_GLUSTER_XATTR, SELINUX_XATTR); + if (ret < 0) + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SL_MSG_SELINUX_GLUSTER_XATTR_MISSING, + "getxattr failed for %s", SELINUX_XATTR); + } + + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata); + return ret; +} + +static int +selinux_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + selinux_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *xattr_name = (char *)name; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("selinux", priv, err); + + /* name can be NULL for listxattr calls */ + if (!priv->selinux_enabled || 
!name) + goto off; + + if (strcmp(name, SELINUX_XATTR) == 0) + xattr_name = SELINUX_GLUSTER_XATTR; + +off: + STACK_WIND_COOKIE(frame, selinux_fgetxattr_cbk, xattr_name, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, fd, + xattr_name, xdata); + return 0; +err: + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, NULL, xdata); + + return 0; +} + +static int +selinux_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) +{ + int ret = 0; + char *name = cookie; + + if (op_errno == 0 && dict && name && + (!strcmp(name, SELINUX_GLUSTER_XATTR))) { + ret = dict_rename_key(dict, SELINUX_GLUSTER_XATTR, SELINUX_XATTR); + if (ret < 0) + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SL_MSG_SELINUX_GLUSTER_XATTR_MISSING, + "getxattr failed for %s", SELINUX_XATTR); + } + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; +} + +static int +selinux_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + selinux_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *xattr_name = (char *)name; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("selinux", priv, err); + + /* name can be NULL for listxattr calls */ + if (!priv->selinux_enabled || !name) + goto off; + + if (strcmp(name, SELINUX_XATTR) == 0) + xattr_name = SELINUX_GLUSTER_XATTR; + +off: + STACK_WIND_COOKIE(frame, selinux_getxattr_cbk, xattr_name, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, loc, + xattr_name, xdata); + return 0; +err: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, NULL, xdata); + return 0; +} + +static int +selinux_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +selinux_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int 
flags, dict_t *xdata) +{ + selinux_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("selinux", priv, err); + + if (!priv->selinux_enabled && !dict) + goto off; + + ret = dict_rename_key(dict, SELINUX_XATTR, SELINUX_GLUSTER_XATTR); + if (ret < 0 && ret != -ENODATA) + goto err; + +off: + STACK_WIND(frame, selinux_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + + return 0; +err: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +selinux_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +selinux_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + selinux_priv_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("selinux", priv, err); + + if (!priv->selinux_enabled && !dict) + goto off; + + ret = dict_rename_key(dict, SELINUX_XATTR, SELINUX_GLUSTER_XATTR); + if (ret < 0 && ret != -ENODATA) + goto err; + +off: + STACK_WIND(frame, selinux_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); + return 0; +err: + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("selinux", this, out); + + ret = xlator_mem_acct_init(this, gf_selinux_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SL_MSG_MEM_ACCT_INIT_FAILED, + "Memory accounting init failed"); + return ret; + } +out: + return ret; +} + +int32_t +init(xlator_t *this) +{ + int32_t ret = -1; + selinux_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("selinux", this, out); + + if 
(!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_WARNING, 0, SL_MSG_INVALID_VOLFILE, + "Error: SELinux (%s) not configured with exactly one " + "child", + this->name); + return -1; + } + + if (this->parents == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, SL_MSG_INVALID_VOLFILE, + "Dangling volume. Please check the volfile"); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_selinux_mt_selinux_priv_t); + if (!priv) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + goto out; + } + + GF_OPTION_INIT("selinux", priv->selinux_enabled, bool, out); + + this->local_pool = mem_pool_new(selinux_priv_t, 64); + if (!this->local_pool) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SL_MSG_ENOMEM, + "Failed to create local_t's memory pool"); + goto out; + } + + this->private = (void *)priv; + ret = 0; +out: + if (ret) { + GF_FREE(priv); + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + selinux_priv_t *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF("selinux", priv->selinux_enabled, options, bool, out); + + ret = 0; +out: + return ret; +} + +void +fini(xlator_t *this) +{ + selinux_priv_t *priv = NULL; + + priv = this->private; + GF_FREE(priv); + + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + + return; +} + +struct xlator_fops fops = { + .getxattr = selinux_getxattr, + .fgetxattr = selinux_fgetxattr, + .setxattr = selinux_setxattr, + .fsetxattr = selinux_fsetxattr, +}; + +struct xlator_cbks cbks = {}; + +struct volume_options options[] = { + { + .key = {"selinux"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Enable/disable selinux translator", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"security", "linux"}, + }, + { + .key = {NULL}, + }}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init 
= mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "selinux", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/selinux/src/selinux.h b/xlators/features/selinux/src/selinux.h new file mode 100644 index 00000000000..1bbdad3bb36 --- /dev/null +++ b/xlators/features/selinux/src/selinux.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __SELINUX_H__ +#define __SELINUX_H__ + +#include <glusterfs/common-utils.h> + +#define SELINUX_XATTR "security.selinux" +#define SELINUX_GLUSTER_XATTR "trusted.glusterfs.selinux" + +struct selinux_priv { + gf_boolean_t selinux_enabled; +}; + +typedef struct selinux_priv selinux_priv_t; + +#endif diff --git a/xlators/features/shard/Makefile.am b/xlators/features/shard/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/shard/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/shard/src/Makefile.am b/xlators/features/shard/src/Makefile.am new file mode 100644 index 00000000000..bf5700d4bcc --- /dev/null +++ b/xlators/features/shard/src/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = shard.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +shard_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +shard_la_SOURCES = shard.c + +shard_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = shard.h shard-mem-types.h shard-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + 
+AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h new file mode 100644 index 00000000000..1fe7e2e2798 --- /dev/null +++ b/xlators/features/shard/src/shard-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __SHARD_MEM_TYPES_H__ +#define __SHARD_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_shard_mem_types_ { + gf_shard_mt_priv_t = gf_common_mt_end + 1, + gf_shard_mt_inode_list, + gf_shard_mt_inode_ctx_t, + gf_shard_mt_iovec, + gf_shard_mt_int64_t, + gf_shard_mt_uint64_t, + gf_shard_mt_end +}; +#endif diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h new file mode 100644 index 00000000000..2d0867eb136 --- /dev/null +++ b/xlators/features/shard/src/shard-messages.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _SHARD_MESSAGES_H_ +#define _SHARD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. 
+ * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(SHARD, SHARD_MSG_BASE_FILE_LOOKUP_FAILED, SHARD_MSG_DICT_OP_FAILED, + SHARD_MSG_DOT_SHARD_NODIR, SHARD_MSG_FD_CTX_SET_FAILED, + SHARD_MSG_INODE_CTX_GET_FAILED, SHARD_MSG_INODE_CTX_SET_FAILED, + SHARD_MSG_INODE_PATH_FAILED, SHARD_MSG_INTERNAL_XATTR_MISSING, + SHARD_MSG_INVALID_VOLFILE, SHARD_MSG_LOOKUP_SHARD_FAILED, + SHARD_MSG_MEM_ACCT_INIT_FAILED, SHARD_MSG_NULL_THIS, + SHARD_MSG_SIZE_SET_FAILED, SHARD_MSG_STAT_FAILED, + SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, + SHARD_MSG_UPDATE_FILE_SIZE_FAILED, SHARD_MSG_FOP_NOT_SUPPORTED, + SHARD_MSG_INVALID_FOP, SHARD_MSG_MEMALLOC_FAILED, + SHARD_MSG_FOP_FAILED, SHARD_MSG_SHARDS_DELETION_FAILED, + SHARD_MSG_SHARD_DELETION_COMPLETED); + +#endif /* !_SHARD_MESSAGES_H_ */ diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c new file mode 100644 index 00000000000..e5f93063943 --- /dev/null +++ b/xlators/features/shard/src/shard.c @@ -0,0 +1,7382 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <unistd.h> + +#include "shard.h" +#include "shard-mem-types.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/defaults.h> +#include <glusterfs/statedump.h> + +static gf_boolean_t +__is_shard_dir(uuid_t gfid) +{ + shard_priv_t *priv = THIS->private; + + if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) + return _gf_true; + + return _gf_false; +} + +static gf_boolean_t +__is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) +{ + if (frame->root->pid == GF_CLIENT_PID_GSYNCD && + (__is_shard_dir(loc->pargfid) || + (loc->parent && __is_shard_dir(loc->parent->gfid)))) + return _gf_true; + + return _gf_false; +} + +void +shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) +{ + char gfid_str[GF_UUID_BUF_SIZE] = { + 0, + }; + + gf_uuid_unparse(gfid, gfid_str); + snprintf(buf, len, "%s.%d", gfid_str, block_num); +} + +void +shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) +{ + char gfid_str[GF_UUID_BUF_SIZE] = { + 0, + }; + + gf_uuid_unparse(gfid, gfid_str); + snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); +} + +int +__shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx_p = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret == 0) { + *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + return ret; + } + + ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); + if (!ctx_p) + return ret; + + INIT_LIST_HEAD(&ctx_p->ilist); + INIT_LIST_HEAD(&ctx_p->to_fsync_list); + + ctx_uint = (uint64_t)(uintptr_t)ctx_p; + ret = __inode_ctx_set(inode, this, &ctx_uint); + if (ret < 0) { + GF_FREE(ctx_p); + return ret; + } + + *ctx = ctx_p; + + return ret; +} + +int +shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get(inode, this, ctx); + } + UNLOCK(&inode->lock); + + 
return ret; +} + +int +__shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, + uint64_t block_size, int32_t valid) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + if (valid & SHARD_MASK_BLOCK_SIZE) + ctx->block_size = block_size; + + if (valid & SHARD_MASK_PROT) + ctx->stat.ia_prot = stbuf->ia_prot; + + if (valid & SHARD_MASK_NLINK) + ctx->stat.ia_nlink = stbuf->ia_nlink; + + if (valid & SHARD_MASK_UID) + ctx->stat.ia_uid = stbuf->ia_uid; + + if (valid & SHARD_MASK_GID) + ctx->stat.ia_gid = stbuf->ia_gid; + + if (valid & SHARD_MASK_SIZE) + ctx->stat.ia_size = stbuf->ia_size; + + if (valid & SHARD_MASK_BLOCKS) + ctx->stat.ia_blocks = stbuf->ia_blocks; + + if (valid & SHARD_MASK_TIMES) { + SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, + stbuf->ia_mtime, stbuf->ia_mtime_nsec); + SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec); + SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, + stbuf->ia_atime, stbuf->ia_atime_nsec); + } + + if (valid & SHARD_MASK_OTHERS) { + ctx->stat.ia_ino = stbuf->ia_ino; + gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); + ctx->stat.ia_dev = stbuf->ia_dev; + ctx->stat.ia_type = stbuf->ia_type; + ctx->stat.ia_rdev = stbuf->ia_rdev; + ctx->stat.ia_blksize = stbuf->ia_blksize; + } + + if (valid & SHARD_MASK_REFRESH_RESET) + ctx->refresh = _gf_false; + + return 0; +} + +int +shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, + uint64_t block_size, int32_t valid) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + ctx->refresh 
= _gf_true; + + return 0; +} +int +shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_set_refresh_flag(inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + ctx->refreshed = _gf_true; + return 0; +} + +int +shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, + inode_t *shard_inode) +{ + int ret = -1; + shard_inode_ctx_t *base_ictx = NULL; + shard_inode_ctx_t *shard_ictx = NULL; + + ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); + if (ret) + return ret; + + ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); + if (ret) + return ret; + + if (shard_ictx->fsync_needed) { + shard_ictx->fsync_needed++; + return 1; + } + + list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); + shard_ictx->inode = shard_inode; + shard_ictx->fsync_needed++; + base_ictx->fsync_count++; + shard_ictx->base_inode = base_inode; + + return 0; +} + +int +shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, + inode_t *shard_inode) +{ + int ret = -1; + + /* This ref acts as a refkeepr on the base inode. We + * need to keep this inode alive as it holds the head + * of the to_fsync_list. 
+ */ + inode_ref(base_inode); + inode_ref(shard_inode); + + LOCK(&base_inode->lock); + LOCK(&shard_inode->lock); + { + ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, + shard_inode); + } + UNLOCK(&shard_inode->lock); + UNLOCK(&base_inode->lock); + + /* Unref the base inode corresponding to the ref above, if the shard is + * found to be already part of the fsync list. + */ + if (ret != 0) { + inode_unref(base_inode); + inode_unref(shard_inode); + } + return ret; +} + +gf_boolean_t +__shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + /* If inode ctx get fails, better to err on the side of caution and + * try again? Unless the failure is due to mem-allocation. + */ + if (ret) + return _gf_true; + + return !ctx->refreshed; +} + +gf_boolean_t +shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) +{ + gf_boolean_t flag = _gf_false; + + LOCK(&inode->lock); + { + flag = __shard_inode_ctx_needs_lookup(inode, this); + } + UNLOCK(&inode->lock); + + return flag; +} +int +__shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + if ((stbuf->ia_size != ctx->stat.ia_size) || + (stbuf->ia_blocks != ctx->stat.ia_blocks)) + ctx->refresh = _gf_true; + + return 0; +} + +int +shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_invalidate(inode, this, stbuf); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t 
*)(uintptr_t)ctx_uint; + + *block_size = ctx->block_size; + + return 0; +} + +int +shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_block_size(inode, this, block_size); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, + int *fsync_count) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + *fsync_count = ctx->fsync_needed; + + return 0; +} + +int +shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, + int *fsync_count) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); + } + UNLOCK(&inode->lock); + + return ret; +} +int +__shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); + return 0; +} + +int +shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_all(inode, this, ctx_out); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, + struct iatt *buf, + gf_boolean_t *need_refresh) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + if (ctx->refresh == _gf_false) + *buf = ctx->stat; + else + *need_refresh = _gf_true; + + return 
0; +} + +int +shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, + struct iatt *buf, + gf_boolean_t *need_refresh) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, + need_refresh); + } + UNLOCK(&inode->lock); + + return ret; +} + +void +shard_local_wipe(shard_local_t *local) +{ + int i = 0; + int count = 0; + + count = local->num_blocks; + + syncbarrier_destroy(&local->barrier); + loc_wipe(&local->loc); + loc_wipe(&local->dot_shard_loc); + loc_wipe(&local->dot_shard_rm_loc); + loc_wipe(&local->loc2); + loc_wipe(&local->tmp_loc); + loc_wipe(&local->int_inodelk.loc); + loc_wipe(&local->int_entrylk.loc); + loc_wipe(&local->newloc); + + if (local->name) + GF_FREE(local->name); + + if (local->int_entrylk.basename) + GF_FREE(local->int_entrylk.basename); + if (local->fd) + fd_unref(local->fd); + + if (local->xattr_req) + dict_unref(local->xattr_req); + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + + for (i = 0; i < count; i++) { + if (!local->inode_list) + break; + + if (local->inode_list[i]) + inode_unref(local->inode_list[i]); + } + + GF_FREE(local->inode_list); + + GF_FREE(local->vector); + if (local->iobref) + iobref_unref(local->iobref); + if (local->list_inited) + gf_dirent_free(&local->entries_head); + if (local->inodelk_frame) + SHARD_STACK_DESTROY(local->inodelk_frame); + if (local->entrylk_frame) + SHARD_STACK_DESTROY(local->entrylk_frame); +} + +int +shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) +{ + int ret = -1; + void *size_attr = NULL; + uint64_t size_array[4]; + + ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) { + gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, + SHARD_MSG_INTERNAL_XATTR_MISSING, + "Failed to " + "get " GF_XATTR_SHARD_FILE_SIZE " for %s", + uuid_utoa(stbuf->ia_gfid)); + return ret; + } + + memcpy(size_array, size_attr, sizeof(size_array)); + + stbuf->ia_size = ntoh64(size_array[0]); + stbuf->ia_blocks 
= ntoh64(size_array[2]); + + return 0; +} + +int +shard_call_count_return(call_frame_t *frame) +{ + int call_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + LOCK(&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + return call_count; +} + +static char * +shard_internal_dir_string(shard_internal_dir_type_t type) +{ + char *str = NULL; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + str = GF_SHARD_DIR; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + str = GF_SHARD_REMOVE_ME_DIR; + break; + default: + break; + } + return str; +} + +static int +shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, + shard_internal_dir_type_t type) +{ + int ret = -1; + char *bname = NULL; + inode_t *parent = NULL; + loc_t *internal_dir_loc = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + if (!local) + return -1; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + internal_dir_loc = &local->dot_shard_loc; + bname = GF_SHARD_DIR; + parent = inode_ref(this->itable->root); + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + internal_dir_loc = &local->dot_shard_rm_loc; + bname = GF_SHARD_REMOVE_ME_DIR; + parent = inode_ref(priv->dot_shard_inode); + break; + default: + break; + } + + internal_dir_loc->inode = inode_new(this->itable); + internal_dir_loc->parent = parent; + ret = inode_path(internal_dir_loc->parent, bname, + (char **)&internal_dir_loc->path); + if (ret < 0 || !(internal_dir_loc->inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", bname); + goto out; + } + + internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); + if (internal_dir_loc->name) + internal_dir_loc->name++; + + ret = 0; +out: + return ret; +} + +inode_t * +__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, + inode_t *base_inode, int block_num, + uuid_t gfid) +{ + char block_bname[256] = { + 0, + }; + inode_t 
*lru_inode = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *lru_inode_ctx = NULL; + shard_inode_ctx_t *lru_base_inode_ctx = NULL; + inode_t *fsync_inode = NULL; + inode_t *lru_base_inode = NULL; + gf_boolean_t do_fsync = _gf_false; + + priv = this->private; + + shard_inode_ctx_get(linked_inode, this, &ctx); + + if (list_empty(&ctx->ilist)) { + if (priv->inode_count + 1 <= priv->lru_limit) { + /* If this inode was linked here for the first time (indicated + * by empty list), and if there is still space in the priv list, + * add this ctx to the tail of the list. + */ + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref(linked_inode); + if (base_inode) + gf_uuid_copy(ctx->base_gfid, base_inode->gfid); + else + gf_uuid_copy(ctx->base_gfid, gfid); + ctx->block_num = block_num; + list_add_tail(&ctx->ilist, &priv->ilist_head); + priv->inode_count++; + ctx->base_inode = inode_ref(base_inode); + } else { + /*If on the other hand there is no available slot for this inode + * in the list, delete the lru inode from the head of the list, + * unlink it. And in its place add this new inode into the list. + */ + lru_inode_ctx = list_first_entry(&priv->ilist_head, + shard_inode_ctx_t, ilist); + GF_ASSERT(lru_inode_ctx->block_num > 0); + lru_base_inode = lru_inode_ctx->base_inode; + list_del_init(&lru_inode_ctx->ilist); + lru_inode = inode_find(linked_inode->table, + lru_inode_ctx->stat.ia_gfid); + /* If the lru inode was part of the pending-fsync list, + * the base inode needs to be unref'd, the lru inode + * deleted from fsync list and fsync'd in a new frame, + * and then unlinked in memory and forgotten. 
+ */ + if (!lru_base_inode) + goto after_fsync_check; + LOCK(&lru_base_inode->lock); + LOCK(&lru_inode->lock); + { + if (!list_empty(&lru_inode_ctx->to_fsync_list)) { + list_del_init(&lru_inode_ctx->to_fsync_list); + lru_inode_ctx->fsync_needed = 0; + do_fsync = _gf_true; + __shard_inode_ctx_get(lru_base_inode, this, + &lru_base_inode_ctx); + lru_base_inode_ctx->fsync_count--; + } + } + UNLOCK(&lru_inode->lock); + UNLOCK(&lru_base_inode->lock); + + after_fsync_check: + if (!do_fsync) { + shard_make_block_bname(lru_inode_ctx->block_num, + lru_inode_ctx->base_gfid, block_bname, + sizeof(block_bname)); + /* The following unref corresponds to the ref held at + * the time the shard was added to the lru list. + */ + inode_unref(lru_inode); + inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); + inode_forget(lru_inode, 0); + } else { + /* The following unref corresponds to the ref + * held when the shard was added to fsync list. + */ + inode_unref(lru_inode); + fsync_inode = lru_inode; + if (lru_base_inode) + inode_unref(lru_base_inode); + } + /* The following unref corresponds to the ref + * held by inode_find() above. + */ + inode_unref(lru_inode); + + /* The following unref corresponds to the ref held on the base shard + * at the time of adding shard inode to lru list + */ + if (lru_base_inode) + inode_unref(lru_base_inode); + + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref(linked_inode); + if (base_inode) + gf_uuid_copy(ctx->base_gfid, base_inode->gfid); + else + gf_uuid_copy(ctx->base_gfid, gfid); + ctx->block_num = block_num; + ctx->base_inode = inode_ref(base_inode); + list_add_tail(&ctx->ilist, &priv->ilist_head); + } + } else { + /* If this is not the first time this inode is being operated on, move + * it to the most recently used end of the list. 
+ */ + list_move_tail(&ctx->ilist, &priv->ilist_head); + } + return fsync_inode; +} + +int +shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, + int32_t op_ret, int32_t op_errno) +{ + switch (fop) { + case GF_FOP_LOOKUP: + SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_STAT: + SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_FSTAT: + SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_TRUNCATE: + SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FTRUNCATE: + SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_MKNOD: + SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_LINK: + SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_CREATE: + SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + break; + case GF_FOP_UNLINK: + SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_RENAME: + SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + break; + case GF_FOP_WRITE: + SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FALLOCATE: + SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_ZEROFILL: + SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_DISCARD: + SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_READ: + SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, + NULL, NULL); + break; + case GF_FOP_FSYNC: + SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case 
GF_FOP_REMOVEXATTR: + SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_FREMOVEXATTR: + SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_FGETXATTR: + SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_GETXATTR: + SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_FSETXATTR: + SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_SETXATTR: + SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_SETATTR: + SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FSETATTR: + SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_SEEK: + SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); + break; + default: + gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; +} + +int +shard_common_inode_write_success_unwind(glusterfs_fop_t fop, + call_frame_t *frame, int32_t op_ret) +{ + shard_local_t *local = frame->local; + + /* the below 3 variables are required because, in SHARD_STACK_UNWIND() + macro, there is a check for local being null. So many static analyzers + backtrace the code with assumption of possible (local == NULL) case, + and complains for below lines. By handling it like below, we overcome + the warnings */ + + struct iatt *prebuf = ((local) ? &local->prebuf : NULL); + struct iatt *postbuf = ((local) ? &local->postbuf : NULL); + dict_t *xattr_rsp = ((local) ? 
local->xattr_rsp : NULL); + + switch (fop) { + case GF_FOP_WRITE: + SHARD_STACK_UNWIND(writev, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_FALLOCATE: + SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_ZEROFILL: + SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_DISCARD: + SHARD_STACK_UNWIND(discard, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + default: + gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; +} + +int +shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + char block_bname[256] = { + 0, + }; + fd_t *anon_fd = cookie; + inode_t *shard_inode = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + + if (anon_fd == NULL || op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, + "fsync failed on shard"); + goto out; + } + shard_inode = anon_fd->inode; + + LOCK(&priv->lock); + LOCK(&shard_inode->lock); + { + __shard_inode_ctx_get(shard_inode, this, &ctx); + if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { + shard_make_block_bname(ctx->block_num, shard_inode->gfid, + block_bname, sizeof(block_bname)); + inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); + /* The following unref corresponds to the ref held by + * inode_link() at the time the shard was created or + * looked up + */ + inode_unref(shard_inode); + inode_forget(shard_inode, 0); + } + } + UNLOCK(&shard_inode->lock); + UNLOCK(&priv->lock); + +out: + if (anon_fd) + fd_unref(anon_fd); + STACK_DESTROY(frame->root); + return 0; +} + +int +shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) +{ + fd_t *anon_fd = NULL; + call_frame_t *fsync_frame = 
NULL; + + fsync_frame = create_frame(this, this->ctx->pool); + if (!fsync_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to fsync shard"); + return -1; + } + + anon_fd = fd_anonymous(inode); + if (!anon_fd) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create anon fd to" + " fsync shard"); + STACK_DESTROY(fsync_frame->root); + return -1; + } + + STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + anon_fd, 1, NULL); + return 0; +} + +int +shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, + xlator_t *this); + +int +shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) +{ + int i = -1; + uint32_t shard_idx_iter = 0; + char path[PATH_MAX] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *inode = NULL; + inode_t *res_inode = NULL; + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uint64_t resolve_count = 0; + + priv = this->private; + local = frame->local; + local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; + + if ((local->op_ret < 0) || (local->resolve_not)) + goto out; + + /* If this prealloc FOP is for fresh file creation, then the size of the + * file will be 0. Then there will be no shards associated with this file. + * So we can skip the lookup process for the shards which do not exists + * and directly issue mknod to crete shards. 
+ * + * In case the prealloc fop is to extend the preallocated file to bigger + * size then just lookup and populate inodes of existing shards and + * update the create count + */ + if (local->fop == GF_FOP_FALLOCATE) { + if (!local->prebuf.ia_size) { + local->inode_list[0] = inode_ref(res_inode); + local->create_count = local->last_block; + shard_common_inode_write_post_lookup_shards_handler(frame, this); + return 0; + } + if (local->prebuf.ia_size < local->total_size) + local->create_count = local->last_block - + ((local->prebuf.ia_size - 1) / + local->block_size); + } + + resolve_count = local->last_block - local->create_count; + + if (res_inode) + gf_uuid_copy(gfid, res_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { + local->inode_list[i] = inode_ref(res_inode); + shard_idx_iter++; + continue; + } + + shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); + + inode = NULL; + inode = inode_resolve(this->itable, path); + if (inode) { + gf_msg_debug(this->name, 0, + "Shard %d already " + "present. gfid=%s. Saving inode for future.", + shard_idx_iter, uuid_utoa(inode->gfid)); + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get + * forgotten by the time the fop reaches the actual + * write stage. 
+ */ + LOCK(&priv->lock); + { + fsync_inode = __shard_update_shards_inode_list( + inode, this, res_inode, shard_idx_iter, gfid); + } + UNLOCK(&priv->lock); + shard_idx_iter++; + if (fsync_inode) + shard_initiate_evicted_inode_fsync(this, fsync_inode); + continue; + } else { + local->call_count++; + shard_idx_iter++; + } + } +out: + post_res_handler(frame, this); + return 0; +} + +int +shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + inode_t *inode = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if ((local->fd) && (local->fd->inode)) + inode = local->fd->inode; + else if (local->loc.inode) + inode = local->loc.inode; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_UPDATE_FILE_SIZE_FAILED, + "Update to file size" + " xattr failed on %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + if (shard_modify_size_and_block_count(&local->postbuf, dict)) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } +err: + local->post_update_size_handler(frame, this); + return 0; +} + +int +shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p) +{ + int ret = -1; + int64_t *size_attr = NULL; + + if (!size_attr_p) + goto out; + + size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); + if (!size_attr) + goto out; + + size_attr[0] = hton64(size); + /* As sharding evolves, it _may_ be necessary to embed more pieces of + * information within the same xattr. So allocating slots for them in + * advance. For now, only bytes 0-63 and 128-191 which would make up the + * current size and block count respectively of the file are valid. 
+ */ + size_attr[2] = hton64(block_count); + + *size_attr_p = size_attr; + + ret = 0; +out: + return ret; +} + +int +shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, + loc_t *loc, shard_post_update_size_fop_handler_t handler) +{ + int ret = -1; + int64_t *size_attr = NULL; + int64_t delta_blocks = 0; + inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + + local = frame->local; + local->post_update_size_handler = handler; + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + if (fd) + inode = fd->inode; + else + inode = loc->inode; + + /* If both size and block count have not changed, then skip the xattrop. + */ + delta_blocks = GF_ATOMIC_GET(local->delta_blocks); + if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { + goto out; + } + + ret = shard_set_size_attrs(local->delta_size + local->hole_size, + delta_blocks, &size_attr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, + "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set key %s into dict. 
gfid=%s", + GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(inode->gfid)); + GF_FREE(size_attr); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + if (fd) + STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, + GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + else + STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, + GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + + dict_unref(xattr_req); + return 0; + +out: + if (xattr_req) + dict_unref(xattr_req); + handler(frame, this); + return 0; +} + +static inode_t * +shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, + struct iatt *buf, shard_internal_dir_type_t type) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + char *bname = NULL; + inode_t **priv_inode = NULL; + inode_t *parent = NULL; + + priv = THIS->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + bname = GF_SHARD_DIR; + priv_inode = &priv->dot_shard_inode; + parent = inode->table->root; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + bname = GF_SHARD_REMOVE_ME_DIR; + priv_inode = &priv->dot_shard_rm_inode; + parent = priv->dot_shard_inode; + break; + default: + break; + } + + linked_inode = inode_link(inode, parent, bname, buf); + inode_lookup(linked_inode); + *priv_inode = linked_inode; + return linked_inode; +} + +int +shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + shard_local_t *local = NULL; + inode_t *linked_inode = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + if (op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto out; + } + + /* To-Do: Fix refcount increment per call to + * shard_link_internal_dir_inode(). 
+ */ + linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); + shard_inode_ctx_mark_dir_refreshed(linked_inode, this); +out: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, + shard_internal_dir_type_t type) +{ + loc_t loc = { + 0, + }; + inode_t *inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = frame->local; + priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(gfid, priv->dot_shard_gfid); + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); + break; + default: + break; + } + + inode = inode_find(this->itable, gfid); + + if (!shard_inode_ctx_needs_lookup(inode, this)) { + local->op_ret = 0; + goto out; + } + + /* Plain assignment because the ref is already taken above through + * call to inode_find() + */ + loc.inode = inode; + gf_uuid_copy(loc.gfid, gfid); + + STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, + NULL); + loc_wipe(&loc); + + return 0; + +out: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + inode_t *link_inode = NULL; + shard_local_t *local = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + if (op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + if (!IA_ISDIR(buf->ia_type)) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, + "%s already exists and " + "is not a directory. 
Please remove it from all bricks " + "and try again", + shard_internal_dir_string(type)); + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + link_inode = shard_link_internal_dir_inode(local, inode, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { + shard_inode_ctx_mark_dir_refreshed(link_inode, this); + shard_common_resolve_shards(frame, this, local->post_res_handler); + } + return 0; + +unwind: + local->post_res_handler(frame, this); + return 0; +} + +int +shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler, + shard_internal_dir_type_t type) +{ + int ret = -1; + dict_t *xattr_req = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uuid_t *gfid = NULL; + loc_t *loc = NULL; + gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + local->post_res_handler = post_res_handler; + + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid) + goto err; + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); + loc = &local->dot_shard_rm_loc; + break; + default: + bzero(*gfid, sizeof(uuid_t)); + break; + } + + ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid of %s into dict", + shard_internal_dir_string(type)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } else { + free_gfid = _gf_false; + } + + STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, + xattr_req); + + dict_unref(xattr_req); 
+ return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + if (free_gfid) + GF_FREE(gfid); + post_res_handler(frame, this); + return 0; +} + +static void +shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata, + struct iatt *buf) +{ + int ret = 0; + uint64_t size = 0; + void *bsize = NULL; + + if (shard_inode_ctx_get_block_size(inode, this, &size)) { + /* Fresh lookup */ + ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); + if (!ret) + size = ntoh64(*((uint64_t *)bsize)); + /* If the file is sharded, set its block size, otherwise just + * set 0. + */ + + shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); + } + /* If the file is sharded, also set the remaining attributes, + * except for ia_size and ia_blocks. + */ + if (size) { + shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); + (void)shard_inode_ctx_invalidate(inode, this, buf); + } +} + +int +shard_delete_shards(void *opaque); + +int +shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); + +int +shard_start_background_deletion(xlator_t *this) +{ + int ret = 0; + gf_boolean_t i_cleanup = _gf_true; + shard_priv_t *priv = NULL; + call_frame_t *cleanup_frame = NULL; + + priv = this->private; + + LOCK(&priv->lock); + { + switch (priv->bg_del_state) { + case SHARD_BG_DELETION_NONE: + i_cleanup = _gf_true; + priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; + break; + case SHARD_BG_DELETION_LAUNCHING: + i_cleanup = _gf_false; + break; + case SHARD_BG_DELETION_IN_PROGRESS: + priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; + i_cleanup = _gf_false; + break; + default: + break; + } + } + UNLOCK(&priv->lock); + if (!i_cleanup) + return 0; + + cleanup_frame = create_frame(this, this->ctx->pool); + if (!cleanup_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create " + "new frame to delete shards"); + ret = -ENOMEM; + goto err; + } + + set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, 
cleanup_frame->root); + + ret = synctask_new(this->ctx->env, shard_delete_shards, + shard_delete_shards_cbk, cleanup_frame, cleanup_frame); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, + SHARD_MSG_SHARDS_DELETION_FAILED, + "failed to create task to do background " + "cleanup of shards"); + STACK_DESTROY(cleanup_frame->root); + goto err; + } + return 0; + +err: + LOCK(&priv->lock); + { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + } + UNLOCK(&priv->lock); + return ret; +} + +int +shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + int ret = -1; + shard_priv_t *priv = NULL; + gf_boolean_t i_start_cleanup = _gf_false; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (IA_ISDIR(buf->ia_type)) + goto unwind; + + /* Also, if the file is sharded, get the file size and block cnt xattr, + * and store them in the stbuf appropriately. + */ + + if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && + frame->root->pid != GF_CLIENT_PID_GSYNCD) + shard_modify_size_and_block_count(buf, xdata); + + /* If this was a fresh lookup, there are two possibilities: + * 1) If the file is sharded (indicated by the presence of block size + * xattr), store this block size, along with rdev and mode in its + * inode ctx. + * 2) If the file is not sharded, store size along with rdev and mode + * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is + * already initialised to all zeroes, nothing more needs to be done. 
+ */ + + (void)shard_inode_ctx_update(inode, this, xdata, buf); + + LOCK(&priv->lock); + { + if (priv->first_lookup_done == _gf_false) { + priv->first_lookup_done = _gf_true; + i_start_cleanup = _gf_true; + } + } + UNLOCK(&priv->lock); + + if (!i_start_cleanup) + goto unwind; + + ret = shard_start_background_deletion(this); + if (ret < 0) { + LOCK(&priv->lock); + { + priv->first_lookup_done = _gf_false; + } + UNLOCK(&priv->lock); + } + +unwind: + SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +int +shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -1; + int32_t op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + this->itable = loc->inode->table; + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && + (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { + SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, loc); + + local->xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); + if (!local->xattr_req) + goto err; + + if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict" + " value: key:%s for path %s", + GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; + } + } + + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, + 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s for path %s.", + GF_XATTR_SHARD_FILE_SIZE, loc->path); + goto err; + } + } + + if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) + dict_del(xattr_req, GF_CONTENT_KEY); + + STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); + return 0; +} + +int +shard_set_iattr_invoke_post_handler(call_frame_t *frame, xlator_t *this, + inode_t *inode, int32_t op_ret, + int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + int ret = -1; + int32_t mask = SHARD_INODE_WRITE_MASK; + shard_local_t *local = frame->local; + shard_inode_ctx_t ctx = { + 0, + }; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_BASE_FILE_LOOKUP_FAILED, + "Lookup on base file" + " failed : %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *buf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + if (shard_inode_ctx_get_all(inode, this, &ctx)) + mask = SHARD_ALL_MASK; + + ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, + (mask | SHARD_MASK_REFRESH_RESET)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 
SHARD_MSG_INODE_CTX_SET_FAILED, 0, + "Failed to set inode" + " write params into inode ctx for %s", + uuid_utoa(buf->ia_gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_fstat_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + shard_local_t *local = frame->local; + + shard_set_iattr_invoke_post_handler(frame, this, local->fd->inode, op_ret, + op_errno, buf, xdata); + return 0; +} + +int +shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + /* In case of op_ret < 0, inode passed to this function will be NULL + ex: in case of op_errno = ENOENT. So refer prefilled inode data + which is part of local. + Note: Reassigning/overriding the inode passed to this cbk with inode + which is part of *struct shard_local_t* won't cause any issue as + both inodes have same reference/address as of the inode passed */ + inode = ((shard_local_t *)frame->local)->loc.inode; + + shard_set_iattr_invoke_post_handler(frame, this, inode, op_ret, op_errno, + buf, xdata); + return 0; +} + +/* This function decides whether to make file based lookup or + * fd based lookup (fstat) depending on the 3rd and 4th arg. + * If fd != NULL and loc == NULL then call is for fstat + * If fd == NULL and loc != NULL then call is for file based + * lookup. Please pass args based on the requirement. + */ +int +shard_refresh_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, shard_post_fop_handler_t handler) +{ + int ret = -1; + inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + gf_boolean_t need_refresh = _gf_false; + + local = frame->local; + local->handler = handler; + inode = fd ? 
fd->inode : loc->inode; + + ret = shard_inode_ctx_fill_iatt_from_cache(inode, this, &local->prebuf, + &need_refresh); + /* By this time, inode ctx should have been created either in create, + * mknod, readdirp or lookup. If not it is a bug! + */ + if ((ret == 0) && (need_refresh == _gf_false)) { + gf_msg_debug(this->name, 0, + "Skipping lookup on base file: %s" + "Serving prebuf off the inode ctx cache", + uuid_utoa(inode->gfid)); + goto out; + } + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, inode->gfid, local, out); + + if (fd) + STACK_WIND(frame, shard_fstat_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xattr_req); + else + STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + + dict_unref(xattr_req); + return 0; + +out: + if (xattr_req) + dict_unref(xattr_req); + handler(frame, this); + return 0; +} + +int +shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret >= 0) + shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, + SHARD_LOOKUP_MASK); + + SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, + &local->prebuf, local->xattr_rsp); + return 0; +} + +int +shard_post_stat_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret >= 0) + shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, + SHARD_LOOKUP_MASK); + + SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, + &local->prebuf, local->xattr_rsp); + return 0; +} + +int +shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + inode_t *inode = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if 
(op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, + "stat failed: %s", + local->fd ? uuid_utoa(local->fd->inode->gfid) + : uuid_utoa((local->loc.inode)->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *buf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + local->xattr_rsp = dict_ref(xdata); + + if (local->loc.inode) + inode = local->loc.inode; + else + inode = local->fd->inode; + + shard_inode_ctx_invalidate(inode, this, &local->prebuf); + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_stat_handler; + loc_copy(&local->loc, loc); + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, + local, err); + + STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); + return 0; +} + +int +shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_fstat_handler; + local->fd = fd_ref(fd); + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + + STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->fop == GF_FOP_TRUNCATE) + SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, NULL); + else + SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, NULL); + return 0; +} + +int +shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + inode_t *inode = NULL; + int64_t delta_blocks = 0; + shard_local_t *local = NULL; + + local = frame->local; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + + inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode + : local->fd->inode; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, + "truncate on last" + " shard failed : %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + local->postbuf.ia_size = local->offset; + /* Let the delta be negative. 
We want xattrop to do subtraction */ + local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; + delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, + postbuf->ia_blocks - prebuf->ia_blocks); + GF_ASSERT(delta_blocks <= 0); + local->postbuf.ia_blocks += delta_blocks; + local->hole_size = 0; + + shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + size_t last_shard_size_after = 0; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + + local = frame->local; + + /* A NULL inode could be due to the fact that the last shard which + * needs to be truncated does not exist due to it lying in a hole + * region. So the only thing left to do in that case would be an + * update to file size xattr. + */ + if (!inode) { + gf_msg_debug(this->name, 0, + "Last shard to be truncated absent in backend: %" PRIu64 + " of gfid %s. 
Directly proceeding to update file size", + local->first_block, uuid_utoa(local->loc.inode->gfid)); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + last_shard_size_after = (local->offset % local->block_size); + + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, + NULL); + loc_wipe(&loc); + return 0; +} + +void +shard_unlink_block_inode(shard_local_t *local, int shard_block_num); + +int +shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int ret = 0; + int call_count = 0; + int shard_block_num = (long)cookie; + uint64_t block_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); + if (!ret) { + GF_ATOMIC_SUB(local->delta_blocks, block_count); + } else { + /* dict_get failed possibly due to a heterogeneous cluster? 
*/ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get key %s from dict during truncate of gfid %s", + GF_GET_FILE_BLOCK_COUNT, + uuid_utoa(local->resolver_base_inode->gfid)); + } + + shard_unlink_block_inode(local, shard_block_num); +done: + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + shard_truncate_last_shard(frame, this, local->inode_list[0]); + } + return 0; +} + +int +shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + int i = 1; + int ret = -1; + int call_count = 0; + uint32_t cur_block = 0; + uint32_t last_block = 0; + char path[PATH_MAX] = { + 0, + }; + char *bname = NULL; + loc_t loc = { + 0, + }; + gf_boolean_t wind_failed = _gf_false; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + dict_t *xdata_req = NULL; + + local = frame->local; + priv = this->private; + + cur_block = local->first_block + 1; + last_block = local->last_block; + + /* Determine call count */ + for (i = 1; i < local->num_blocks; i++) { + if (!local->inode_list[i]) + continue; + call_count++; + } + + if (!call_count) { + /* Call count = 0 implies that all of the shards that need to be + * unlinked do not exist. So shard xlator would now proceed to + * do the final truncate + size updates. + */ + gf_msg_debug(this->name, 0, + "Shards to be unlinked as part of " + "truncate absent in backend: %s. 
Directly " + "proceeding to update file size", + uuid_utoa(inode->gfid)); + local->postbuf.ia_size = local->offset; + local->postbuf.ia_blocks = local->prebuf.ia_blocks; + local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; + GF_ATOMIC_INIT(local->delta_blocks, 0); + local->hole_size = 0; + shard_update_file_size(frame, this, local->fd, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } + + local->call_count = call_count; + i = 1; + xdata_req = dict_new(); + if (!xdata_req) { + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set key %s into dict during truncate of %s", + GF_GET_FILE_BLOCK_COUNT, + uuid_utoa(local->resolver_base_inode->gfid)); + dict_unref(xdata_req); + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + while (cur_block <= last_block) { + if (!local->inode_list[i]) { + cur_block++; + i++; + continue; + } + if (wind_failed) { + shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); + bname = strrchr(path, '/') + 1; + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s. 
Base file gfid = %s", + bname, uuid_utoa(inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + loc.inode = inode_ref(local->inode_list[i]); + + STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, + (void *)(long)cur_block, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &loc, 0, xdata_req); + loc_wipe(&loc); + next: + i++; + cur_block++; + if (!--call_count) + break; + } + dict_unref(xdata_req); + return 0; +} + +int +shard_truncate_do(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->num_blocks == 1) { + /* This means that there are no shards to be unlinked. + * The fop boils down to truncating the last shard, updating + * the size and unwinding. + */ + shard_truncate_last_shard(frame, this, local->inode_list[0]); + return 0; + } else { + shard_truncate_htol(frame, this, local->loc.inode); + } + return 0; +} + +int +shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + shard_truncate_do(frame, this); + return 0; +} + +void +shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, + struct iatt *buf) +{ + int list_index = 0; + char block_bname[256] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + inode_t *base_inode = NULL; + + this = THIS; + priv = this->private; + if (local->loc.inode) { + gf_uuid_copy(gfid, local->loc.inode->gfid); + base_inode = local->loc.inode; + } else if (local->resolver_base_inode) { + 
gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + base_inode = local->resolver_base_inode; + } else { + gf_uuid_copy(gfid, local->base_gfid); + } + + shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); + + shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); + linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); + inode_lookup(linked_inode); + list_index = block_num - local->first_block; + local->inode_list[list_index] = linked_inode; + + LOCK(&priv->lock); + { + fsync_inode = __shard_update_shards_inode_list( + linked_inode, this, base_inode, block_num, gfid); + } + UNLOCK(&priv->lock); + if (fsync_inode) + shard_initiate_evicted_inode_fsync(this, fsync_inode); +} + +int +shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + int call_count = 0; + int shard_block_num = (long)cookie; + uuid_t gfid = { + 0, + }; + shard_local_t *local = NULL; + + local = frame->local; + if (local->resolver_base_inode) + gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + if (op_ret < 0) { + /* Ignore absence of shards in the backend in truncate fop. */ + switch (local->fop) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_RENAME: + case GF_FOP_UNLINK: + if (op_errno == ENOENT) + goto done; + break; + case GF_FOP_WRITE: + case GF_FOP_READ: + case GF_FOP_ZEROFILL: + case GF_FOP_DISCARD: + case GF_FOP_FALLOCATE: + if ((!local->first_lookup_done) && (op_errno == ENOENT)) { + LOCK(&frame->lock); + { + local->create_count++; + } + UNLOCK(&frame->lock); + goto done; + } + break; + default: + break; + } + + /* else */ + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_LOOKUP_SHARD_FAILED, + "Lookup on shard %d " + "failed. 
Base file gfid = %s", + shard_block_num, uuid_utoa(gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + + shard_link_block_inode(local, shard_block_num, inode, buf); + +done: + if (local->lookup_shards_barriered) { + syncbarrier_wake(&local->barrier); + return 0; + } else { + call_count = shard_call_count_return(frame); + if (call_count == 0) { + if (!local->first_lookup_done) + local->first_lookup_done = _gf_true; + local->pls_fop_handler(frame, this); + } + } + return 0; +} + +dict_t * +shard_create_gfid_dict(dict_t *dict) +{ + int ret = 0; + dict_t *new = NULL; + unsigned char *gfid = NULL; + + new = dict_copy_with_ref(dict, NULL); + if (!new) + return NULL; + + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); + if (!gfid) { + ret = -1; + goto out; + } + + gf_uuid_generate(gfid); + + ret = dict_set_gfuuid(new, "gfid-req", gfid, false); + +out: + if (ret) { + dict_unref(new); + new = NULL; + GF_FREE(gfid); + } + + return new; +} + +int +shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + shard_post_lookup_shards_fop_handler_t handler) +{ + int i = 0; + int ret = 0; + int count = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; + int lookup_count = 0; + char path[PATH_MAX] = { + 0, + }; + char *bname = NULL; + uuid_t gfid = { + 0, + }; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + gf_boolean_t wind_failed = _gf_false; + dict_t *xattr_req = NULL; + + priv = this->private; + local = frame->local; + count = call_count = local->call_count; + shard_idx_iter = local->first_block; + lookup_count = local->last_block - local->create_count; + local->pls_fop_handler = handler; + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; + + if (inode) + gf_uuid_copy(gfid, inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { + i++; + shard_idx_iter++; + 
continue; + } + + if (wind_failed) { + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); + + bname = strrchr(path, '/') + 1; + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0 || !(loc.inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s, base file gfid = %s", + bname, uuid_utoa(gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + loc_wipe(&loc); + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, + (void *)(long)shard_idx_iter, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); + loc_wipe(&loc); + dict_unref(xattr_req); + next: + shard_idx_iter++; + i++; + + if (!--call_count) + break; + } + if (local->lookup_shards_barriered) { + syncbarrier_wait(&local->barrier, count); + local->pls_fop_handler(frame, this); + } + return 0; +} + +int +shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + if (local->op_errno == ENOENT) { + /* If lookup on /.shard fails with ENOENT, it means that + * the file was 0-byte in size but truncated 
sometime in + * the past to a higher size which is reflected in the + * size xattr, and now being truncated to a lower size. + * In this case, the only thing that needs to be done is + * to update the size xattr of the file and unwind. + */ + local->first_block = local->last_block = 0; + local->num_blocks = 1; + local->call_count = 0; + local->op_ret = 0; + local->postbuf.ia_size = local->offset; + shard_update_file_size(frame, this, local->fd, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } else { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + } + + if (!local->call_count) + shard_truncate_do(frame, this); + else + shard_common_lookup_shards(frame, this, local->loc.inode, + shard_post_lookup_shards_truncate_handler); + + return 0; +} + +int +shard_truncate_begin(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + /* First participant block here is the lowest numbered block that would + * hold the last byte of the file post successful truncation. + * Last participant block is the block that contains the last byte in + * the current state of the file. + * If (first block == last_block): + * then that means that the file only needs truncation of the + * first (or last since both are same) block. + * Else + * if (new_size % block_size == 0) + * then that means there is no truncate to be done with + * only shards from first_block + 1 through the last + * block needing to be unlinked. + * else + * both truncate of the first block and unlink of the + * remaining shards until end of file is required. + */ + local->first_block = (local->offset == 0) + ? 
0 + : get_lowest_block(local->offset - 1, + local->block_size); + local->last_block = get_highest_block(0, local->prebuf.ia_size, + local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = (local->fop == GF_FOP_TRUNCATE) + ? local->loc.inode + : local->fd->inode; + + if ((local->first_block == 0) && (local->num_blocks == 1)) { + if (local->fop == GF_FOP_TRUNCATE) + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->offset, local->xattr_req); + else + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, local->fd, + local->offset, local->xattr_req); + return 0; + } + + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + goto err; + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + ret = shard_init_internal_dir_loc(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; + shard_lookup_internal_dir(frame, this, + shard_post_resolve_truncate_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_truncate_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + struct iatt tmp_stbuf = { + 0, + }; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + local->postbuf = tmp_stbuf = local->prebuf; + + if (local->prebuf.ia_size == local->offset) { + /* If the file size is same as requested size, unwind the call + * 
immediately. + */ + if (local->fop == GF_FOP_TRUNCATE) + SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, + &local->postbuf, NULL); + else + SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, + &local->postbuf, NULL); + } else if (local->offset > local->prebuf.ia_size) { + /* If the truncate is from a lower to a higher size, set the + * new size xattr and unwind. + */ + local->hole_size = local->offset - local->prebuf.ia_size; + local->delta_size = 0; + GF_ATOMIC_INIT(local->delta_blocks, 0); + local->postbuf.ia_size = local->offset; + tmp_stbuf.ia_size = local->offset; + shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, + SHARD_INODE_WRITE_MASK); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + } else { + /* ... else + * i. unlink all shards that need to be unlinked. + * ii. truncate the last of the shards. + * iii. update the new size using setxattr. + * and unwind the fop. + */ + local->hole_size = 0; + local->delta_size = (local->offset - local->prebuf.ia_size); + GF_ATOMIC_INIT(local->delta_blocks, 0); + tmp_stbuf.ia_size = local->offset; + shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, + SHARD_INODE_WRITE_MASK); + shard_truncate_begin(frame, this); + } + return 0; +} + +/* TO-DO: + * Fix updates to size and block count with racing write(s) and truncate(s). 
+ */ + +int +shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + } + + if (!this->itable) + this->itable = loc->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + loc_copy(&local->loc, loc); + local->offset = offset; + local->block_size = block_size; + local->fop = GF_FOP_TRUNCATE; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->resolver_base_inode = loc->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + shard_refresh_base_file(frame, this, &local->loc, NULL, + shard_post_lookup_truncate_handler); + return 0; + +err: + shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); + return 0; +} + +int +shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + local->fd = fd_ref(fd); + local->offset = offset; + local->block_size = block_size; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_FTRUNCATE; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + local->resolver_base_inode = fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_truncate_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); + return 0; +} + +int +shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ret = shard_inode_ctx_set(inode, this, buf, local->block_size, + SHARD_ALL_MASK); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, + "Failed to set inode " + "ctx for %s", + uuid_utoa(inode->gfid)); + +unwind: + SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + + return 0; +} + +int +shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->block_size = priv->block_size; + if (!__is_gsyncd_on_shard_dir(frame, loc)) { + SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); + } + + STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); + return 0; +} + +int32_t +shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, 
struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + shard_local_t *local = NULL; + + local = frame->local; + if (op_ret < 0) + goto err; + + shard_inode_ctx_set(inode, this, buf, 0, + SHARD_MASK_NLINK | SHARD_MASK_TIMES); + buf->ia_size = local->prebuf.ia_size; + buf->ia_blocks = local->prebuf.ia_blocks; + + SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); + return 0; +} + +int +shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, + NULL, NULL, NULL, NULL); + return 0; + } + + STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, + local->xattr_req); + return 0; +} + +int32_t +shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(oldloc->inode->gfid)); + goto err; + } + + if (!block_size) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; + } + + if (!this->itable) + this->itable = oldloc->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + shard_refresh_base_file(frame, this, &local->loc, NULL, + shard_post_lookup_link_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); + return 0; +} + +int +shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = frame->local; + + if (local->resolver_base_inode) + gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { + gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, + "failed to delete shards of %s", uuid_utoa(gfid)); + return 0; + } + local->op_ret = 0; + local->op_errno = 0; + + shard_unlink_shards_do(frame, this, local->resolver_base_inode); + return 0; +} + +int +shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + local->lookup_shards_barriered = _gf_true; + + if (!local->call_count) + shard_unlink_shards_do(frame, this, local->resolver_base_inode); + else + shard_common_lookup_shards(frame, this, local->resolver_base_inode, + shard_post_lookup_shards_unlink_handler); + return 0; +} + +void +shard_unlink_block_inode(shard_local_t *local, int shard_block_num) +{ + char block_bname[256] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *inode = NULL; + inode_t *base_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *base_ictx = NULL; + int unref_base_inode = 0; + int unref_shard_inode = 0; + + this = THIS; + priv = this->private; + + inode = local->inode_list[shard_block_num - local->first_block]; + shard_inode_ctx_get(inode, this, &ctx); + base_inode = ctx->base_inode; + if (base_inode) + 
gf_uuid_copy(gfid, base_inode->gfid); + else + gf_uuid_copy(gfid, ctx->base_gfid); + shard_make_block_bname(shard_block_num, gfid, block_bname, + sizeof(block_bname)); + + LOCK(&priv->lock); + if (base_inode) + LOCK(&base_inode->lock); + LOCK(&inode->lock); + { + __shard_inode_ctx_get(inode, this, &ctx); + if (!list_empty(&ctx->ilist)) { + list_del_init(&ctx->ilist); + priv->inode_count--; + unref_base_inode++; + unref_shard_inode++; + GF_ASSERT(priv->inode_count >= 0); + } + if (ctx->fsync_needed) { + unref_base_inode++; + unref_shard_inode++; + list_del_init(&ctx->to_fsync_list); + if (base_inode) { + __shard_inode_ctx_get(base_inode, this, &base_ictx); + base_ictx->fsync_count--; + } + } + } + UNLOCK(&inode->lock); + if (base_inode) + UNLOCK(&base_inode->lock); + + inode_unlink(inode, priv->dot_shard_inode, block_bname); + inode_ref_reduce_by_n(inode, unref_shard_inode); + inode_forget(inode, 0); + + if (base_inode && unref_base_inode) + inode_ref_reduce_by_n(base_inode, unref_base_inode); + UNLOCK(&priv->lock); +} + +int +shard_rename_cbk(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->preoldparent, + &local->postoldparent, &local->prenewparent, + &local->postnewparent, local->xattr_rsp); + return 0; +} + +int32_t +shard_unlink_cbk(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = frame->local; + + SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preoldparent, &local->postoldparent, + local->xattr_rsp); + return 0; +} + +int +shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int shard_block_num = (long)cookie; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; 
+ goto done; + } + + shard_unlink_block_inode(local, shard_block_num); +done: + syncbarrier_wake(&local->barrier); + return 0; +} + +int +shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + int i = 0; + int ret = -1; + int count = 0; + uint32_t cur_block = 0; + uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ + char *bname = NULL; + char path[PATH_MAX] = { + 0, + }; + uuid_t gfid = { + 0, + }; + loc_t loc = { + 0, + }; + gf_boolean_t wind_failed = _gf_false; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (inode) + gf_uuid_copy(gfid, inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + for (i = 0; i < local->num_blocks; i++) { + if (!local->inode_list[i]) + continue; + count++; + } + + if (!count) { + /* callcount = 0 implies that all of the shards that need to be + * unlinked are non-existent (in other words the file is full of + * holes). + */ + gf_msg_debug(this->name, 0, + "All shards that need to be " + "unlinked are non-existent: %s", + uuid_utoa(gfid)); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + local->barrier.waitfor = count; + cur_block = cur_block_idx + local->first_block; + + while (cur_block_idx < local->num_blocks) { + if (!local->inode_list[cur_block_idx]) + goto next; + + if (wind_failed) { + shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); + bname = strrchr(path, '/') + 1; + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s, base file gfid = %s", + bname, uuid_utoa(gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_unlink_shards_do_cbk(frame, (void 
*)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + loc.inode = inode_ref(local->inode_list[cur_block_idx]); + + STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, + (void *)(long)cur_block, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, + local->xattr_req); + loc_wipe(&loc); + next: + cur_block++; + cur_block_idx++; + } + syncbarrier_wait(&local->barrier, count); + SHARD_UNSET_ROOT_FS_ID(frame, local); + return 0; +} + +int +shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, + int now, int first_block, gf_dirent_t *entry) +{ + int i = 0; + int ret = 0; + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = cleanup_frame->local; + + local->inode_list = GF_CALLOC(now, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + return -ENOMEM; + + local->first_block = first_block; + local->last_block = first_block + now - 1; + local->num_blocks = now; + gf_uuid_parse(entry->d_name, gfid); + gf_uuid_copy(local->base_gfid, gfid); + local->resolver_base_inode = inode_find(this->itable, gfid); + local->call_count = 0; + ret = syncbarrier_init(&local->barrier); + if (ret) { + GF_FREE(local->inode_list); + local->inode_list = NULL; + inode_unref(local->resolver_base_inode); + local->resolver_base_inode = NULL; + return -errno; + } + shard_common_resolve_shards(cleanup_frame, this, + shard_post_resolve_unlink_handler); + + for (i = 0; i < local->num_blocks; i++) { + if (local->inode_list[i]) + inode_unref(local->inode_list[i]); + } + GF_FREE(local->inode_list); + local->inode_list = NULL; + if (local->op_ret) + ret = -local->op_errno; + syncbarrier_destroy(&local->barrier); + inode_unref(local->resolver_base_inode); + local->resolver_base_inode = NULL; + STACK_RESET(cleanup_frame->root); + return ret; +} + +int +__shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, + 
gf_dirent_t *entry, inode_t *inode) +{ + int ret = 0; + int shard_count = 0; + int first_block = 0; + int now = 0; + uint64_t size = 0; + uint64_t block_size = 0; + uint64_t size_array[4] = { + 0, + }; + void *bsize = NULL; + void *size_attr = NULL; + dict_t *xattr_rsp = NULL; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = cleanup_frame->local; + ret = dict_reset(local->xattr_req); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to reset dict"); + ret = -ENOMEM; + goto err; + } + + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); + ret = -ENOMEM; + goto err; + } + + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); + ret = -ENOMEM; + goto err; + } + + loc.inode = inode_ref(inode); + loc.parent = inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", entry->d_name); + ret = -ENOMEM; + goto err; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, + &xattr_rsp); + if (ret) + goto err; + + ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); + goto err; + } + block_size = ntoh64(*((uint64_t *)bsize)); + + ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) { + gf_msg(this->name, 
GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); + goto err; + } + + memcpy(size_array, size_attr, sizeof(size_array)); + size = ntoh64(size_array[0]); + + shard_count = (size / block_size) - 1; + if (shard_count < 0) { + gf_msg_debug(this->name, 0, + "Size of %s hasn't grown beyond " + "its shard-block-size. Nothing to delete. " + "Returning", + entry->d_name); + /* File size < shard-block-size, so nothing to delete */ + ret = 0; + goto delete_marker; + } + if ((size % block_size) > 0) + shard_count++; + + if (shard_count == 0) { + gf_msg_debug(this->name, 0, + "Size of %s is exactly equal to " + "its shard-block-size. Nothing to delete. " + "Returning", + entry->d_name); + ret = 0; + goto delete_marker; + } + gf_msg_debug(this->name, 0, + "base file = %s, " + "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 + ", " + "shard_count=%d", + entry->d_name, block_size, size, shard_count); + + /* Perform a gfid-based lookup to see if gfid corresponding to marker + * file's base name exists. + */ + loc_wipe(&loc); + loc.inode = inode_new(this->itable); + if (!loc.inode) { + ret = -ENOMEM; + goto err; + } + gf_uuid_parse(entry->d_name, loc.gfid); + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); + if (!ret) { + gf_msg_debug(this->name, 0, + "Base shard corresponding to gfid " + "%s is present. Skipping shard deletion. 
" + "Returning", + entry->d_name); + ret = 0; + goto delete_marker; + } + + first_block = 1; + + while (shard_count) { + if (shard_count < local->deletion_rate) { + now = shard_count; + shard_count = 0; + } else { + now = local->deletion_rate; + shard_count -= local->deletion_rate; + } + + gf_msg_debug(this->name, 0, + "deleting %d shards starting from " + "block %d of gfid %s", + now, first_block, entry->d_name); + ret = shard_regulated_shards_deletion(cleanup_frame, this, now, + first_block, entry); + if (ret) + goto err; + first_block += now; + } + +delete_marker: + loc_wipe(&loc); + loc.inode = inode_ref(inode); + loc.parent = inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", entry->d_name); + ret = -ENOMEM; + goto err; + } + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, + "Failed to delete %s " + "from /%s", + entry->d_name, GF_SHARD_REMOVE_ME_DIR); +err: + if (xattr_rsp) + dict_unref(xattr_rsp); + loc_wipe(&loc); + return ret; +} + +int +shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, + gf_dirent_t *entry, inode_t *inode) +{ + int ret = -1; + loc_t loc = { + 0, + }; + shard_priv_t *priv = NULL; + + priv = this->private; + loc.inode = inode_ref(priv->dot_shard_rm_inode); + + ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); + if (ret < 0) { + if (ret == -EAGAIN) { + ret = 0; + } + goto out; + } + { + ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); + } + syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); +out: + loc_wipe(&loc); + return ret; +} + +int 
+shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) +{ + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, + shard_internal_dir_type_t type) +{ + int ret = 0; + char *bname = NULL; + loc_t *loc = NULL; + shard_priv_t *priv = NULL; + uuid_t gfid = { + 0, + }; + struct iatt stbuf = { + 0, + }; + + priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + loc = &local->dot_shard_loc; + gf_uuid_copy(gfid, priv->dot_shard_gfid); + bname = GF_SHARD_DIR; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + loc = &local->dot_shard_rm_loc; + gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); + bname = GF_SHARD_REMOVE_ME_DIR; + break; + default: + break; + } + + loc->inode = inode_find(this->itable, gfid); + if (!loc->inode) { + ret = shard_init_internal_dir_loc(this, local, type); + if (ret) + goto err; + ret = dict_reset(local->xattr_req); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to reset " + "dict"); + ret = -ENOMEM; + goto err; + } + ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); + ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, + local->xattr_req, NULL); + if (ret < 0) { + if (ret != -ENOENT) + gf_msg(this->name, GF_LOG_ERROR, -ret, + SHARD_MSG_SHARDS_DELETION_FAILED, + "Lookup on %s failed, exiting", bname); + goto err; + } else { + shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); + } + } + ret = 0; +err: + return ret; +} + +int +shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, + gf_dirent_t *entry) +{ + int ret = 0; + loc_t loc = { + 0, + }; + + loc.inode = inode_new(this->itable); + if (!loc.inode) { + ret = -ENOMEM; + goto err; + } + loc.parent = inode_ref(local->fd->inode); + + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", 
entry->d_name); + ret = -ENOMEM; + goto err; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); + if (ret < 0) { + goto err; + } + entry->inode = inode_ref(loc.inode); + ret = 0; +err: + loc_wipe(&loc); + return ret; +} + +int +shard_delete_shards(void *opaque) +{ + int ret = 0; + off_t offset = 0; + loc_t loc = { + 0, + }; + inode_t *link_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + call_frame_t *cleanup_frame = NULL; + gf_boolean_t done = _gf_false; + + this = THIS; + priv = this->private; + INIT_LIST_HEAD(&entries.list); + + cleanup_frame = opaque; + + local = mem_get0(this->local_pool); + if (!local) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create local to " + "delete shards"); + ret = -ENOMEM; + goto err; + } + cleanup_frame->local = local; + local->fop = GF_FOP_UNLINK; + + local->xattr_req = dict_new(); + if (!local->xattr_req) { + ret = -ENOMEM; + goto err; + } + local->deletion_rate = priv->deletion_rate; + + ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret == -ENOENT) { + gf_msg_debug(this->name, 0, + ".shard absent. Nothing to" + " delete. Exiting"); + ret = 0; + goto err; + } else if (ret < 0) { + goto err; + } + + ret = shard_resolve_internal_dir(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + if (ret == -ENOENT) { + gf_msg_debug(this->name, 0, + ".remove_me absent. " + "Nothing to delete. 
Exiting"); + ret = 0; + goto err; + } else if (ret < 0) { + goto err; + } + + local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); + if (!local->fd) { + ret = -ENOMEM; + goto err; + } + + for (;;) { + offset = 0; + LOCK(&priv->lock); + { + if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { + priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; + } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + done = _gf_true; + } + } + UNLOCK(&priv->lock); + if (done) + break; + while ( + (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, + &entries, local->xattr_req, NULL))) { + if (ret > 0) + ret = 0; + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + if (!entry->inode) { + ret = shard_lookup_marker_entry(this, local, entry); + if (ret < 0) + continue; + } + link_inode = inode_link(entry->inode, local->fd->inode, + entry->d_name, &entry->d_stat); + + gf_msg_debug(this->name, 0, + "Initiating deletion of " + "shards of gfid %s", + entry->d_name); + ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, + link_inode); + inode_unlink(link_inode, local->fd->inode, entry->d_name); + inode_unref(link_inode); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, + SHARD_MSG_SHARDS_DELETION_FAILED, + "Failed to clean up shards of gfid %s", + entry->d_name); + continue; + } + gf_msg(this->name, GF_LOG_INFO, 0, + SHARD_MSG_SHARD_DELETION_COMPLETED, + "Deleted " + "shards of gfid=%s from backend", + entry->d_name); + } + gf_dirent_free(&entries); + if (ret) + break; + } + } + ret = 0; + loc_wipe(&loc); + return ret; + +err: + LOCK(&priv->lock); + { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + } + UNLOCK(&priv->lock); + loc_wipe(&loc); + return ret; +} + +int +shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t 
*xdata) +{ + if (op_ret) + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Unlock failed. Please check brick logs for " + "more details"); + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_inodelk_t *lock = NULL; + + local = frame->local; + lk_frame = local->inodelk_frame; + lk_local = lk_frame->local; + local->inodelk_frame = NULL; + loc = &local->int_inodelk.loc; + lock = &lk_local->int_inodelk; + lock->flock.l_type = F_UNLCK; + + STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, + &lock->flock, NULL); + local->int_inodelk.acquired_lock = _gf_false; + return 0; +} + +int +shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); +int +shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + loc_t *dst_loc = NULL; + loc_t tmp_loc = { + 0, + }; + shard_local_t *local = frame->local; + + if (local->dst_block_size) { + tmp_loc.parent = inode_ref(local->loc2.parent); + ret = inode_path(tmp_loc.parent, local->loc2.name, + (char **)&tmp_loc.path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on pargfid=%s bname=%s", + uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + tmp_loc.name = strrchr(tmp_loc.path, '/'); + if (tmp_loc.name) + tmp_loc.name++; + dst_loc = &tmp_loc; + } else { + dst_loc = &local->loc2; + } + + /* To-Do: Request open-fd count on dst base file */ + STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, + local->xattr_req); + loc_wipe(&tmp_loc); + return 0; +err: + loc_wipe(&tmp_loc); + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_unlink_base_file(call_frame_t *frame, xlator_t *this); + +int +shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Xattrop on marker file failed " + "while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } + + inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, + local->newloc.name); + + if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + else if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); + return 0; +} + +int +shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) +{ + int op_errno = ENOMEM; + uint64_t bs = 0; + dict_t *xdata = NULL; + shard_local_t *local = NULL; + + local = frame->local; + xdata = dict_new(); + if (!xdata) + goto err; + + if (local->fop == GF_FOP_UNLINK) + bs = local->block_size; + else if (local->fop == GF_FOP_RENAME) + bs = local->dst_block_size; + SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, + local->prebuf.ia_size, 0, err); + STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, + &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); + dict_unref(xdata); + return 0; +err: + if (xdata) + dict_unref(xdata); + shard_common_failure_unwind(local->fop, frame, -1, op_errno); + return 0; +} + +int 
+shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + local = frame->local; + priv = this->private; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Lookup on marker file failed " + "while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } + + linked_inode = inode_link(inode, priv->dot_shard_rm_inode, + local->newloc.name, buf); + inode_unref(local->newloc.inode); + local->newloc.inode = linked_inode; + shard_set_size_attrs_on_marker_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); + return 0; +} + +int +shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) +{ + int op_errno = ENOMEM; + dict_t *xattr_req = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) + goto err; + + STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); + dict_unref(xattr_req); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, op_errno); + return 0; +} + +int +shard_create_marker_file_under_remove_me_cbk( + call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + local = frame->local; + priv = this->private; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + if (op_ret < 0) { + if ((op_errno != EEXIST) && (op_errno != ENODATA)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg(this->name, 
GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Marker file creation " + "failed while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } else { + shard_lookup_marker_file(frame, this); + return 0; + } + } + + linked_inode = inode_link(inode, priv->dot_shard_rm_inode, + local->newloc.name, buf); + inode_unref(local->newloc.inode); + local->newloc.inode = linked_inode; + + if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + else if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +} + +int +shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + int ret = 0; + int op_errno = ENOMEM; + uint64_t bs = 0; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + dict_t *xattr_req = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + SHARD_SET_ROOT_FS_ID(frame, local); + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) + goto err; + + local->newloc.inode = inode_new(this->itable); + local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), + (char **)&local->newloc.path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on " + "pargfid=%s bname=%s", + uuid_utoa_r(priv->dot_shard_rm_gfid, g1), + uuid_utoa_r(loc->inode->gfid, g2)); + goto err; + } + local->newloc.name = strrchr(local->newloc.path, '/'); + if (local->newloc.name) + local->newloc.name++; + + if (local->fop == GF_FOP_UNLINK) + bs = local->block_size; + else if (local->fop == GF_FOP_RENAME) + bs = local->dst_block_size; + + SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, + local->prebuf.ia_size, 0, err); + + STACK_WIND(frame, 
shard_create_marker_file_under_remove_me_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &local->newloc, 0, 0, 0644, xattr_req); + dict_unref(xattr_req); + return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + +int +shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); + local->preoldparent = *preparent; + local->postoldparent = *postparent; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + if (local->cleanup_required) + shard_start_background_deletion(this); + } + + if (local->entrylk_frame) { + ret = shard_unlock_entrylk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } + + ret = shard_unlock_inodelk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + + shard_unlink_cbk(frame, this); + return 0; +} + +int +shard_unlink_base_file(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = frame->local; + + /* To-Do: Request open-fd count on base file */ + STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, + local->xattr_req); + return 0; +} + +int +shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + if (op_ret) + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Unlock failed. 
Please check brick logs for " + "more details"); + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_entrylk_t *lock = NULL; + + local = frame->local; + lk_frame = local->entrylk_frame; + lk_local = lk_frame->local; + local->entrylk_frame = NULL; + lock = &lk_local->int_entrylk; + loc = &lock->loc; + + STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, loc, + lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, + NULL); + local->int_entrylk.acquired_lock = _gf_false; + return 0; +} + +int +shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: + shard_create_marker_file_under_remove_me(frame, this, + &local->int_inodelk.loc); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "post-entrylk handler not defined. 
This case should not" + " be hit"); + break; + } + return 0; +} + +int +shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + call_frame_t *main_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *main_local = NULL; + + local = frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(main_local->fop, main_frame, op_ret, + op_errno); + return 0; + } + main_local->int_entrylk.acquired_lock = _gf_true; + shard_post_entrylk_fop_handler(main_frame, this); + return 0; +} + +int +shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid) +{ + char gfid_str[GF_UUID_BUF_SIZE] = { + 0, + }; + shard_local_t *local = NULL; + shard_local_t *entrylk_local = NULL; + shard_entrylk_t *int_entrylk = NULL; + call_frame_t *entrylk_frame = NULL; + + local = frame->local; + entrylk_frame = create_frame(this, this->ctx->pool); + if (!entrylk_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to lock marker file"); + goto err; + } + + entrylk_local = mem_get0(this->local_pool); + if (!entrylk_local) { + STACK_DESTROY(entrylk_frame->root); + goto err; + } + + entrylk_frame->local = entrylk_local; + entrylk_local->main_frame = frame; + int_entrylk = &entrylk_local->int_entrylk; + + int_entrylk->loc.inode = inode_ref(inode); + set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); + local->entrylk_frame = entrylk_frame; + gf_uuid_unparse(gfid, gfid_str); + int_entrylk->basename = gf_strdup(gfid_str); + + STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, + int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int 
+shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + + if (local->prebuf.ia_nlink > 1) { + gf_msg_debug(this->name, 0, + "link count on %s > 1:%d, " + "performing rename()/unlink()", + local->int_inodelk.loc.path, local->prebuf.ia_nlink); + if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + else if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + } else { + gf_msg_debug(this->name, 0, + "link count on %s = 1, creating " + "file under .remove_me", + local->int_inodelk.loc.path); + local->cleanup_required = _gf_true; + shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, + local->prebuf.ia_gfid); + } + return 0; +} + +int +shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: + shard_refresh_base_file(frame, this, &local->int_inodelk.loc, NULL, + shard_post_lookup_base_shard_rm_handler); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "post-inodelk handler not defined. 
This case should not" + " be hit"); + break; + } + return 0; +} + +int +shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + call_frame_t *main_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *main_local = NULL; + + local = frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(main_local->fop, main_frame, op_ret, + op_errno); + return 0; + } + main_local->int_inodelk.acquired_lock = _gf_true; + shard_post_inodelk_fop_handler(main_frame, this); + return 0; +} + +int +shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_inodelk_t *int_inodelk = NULL; + + local = frame->local; + lk_frame = create_frame(this, this->ctx->pool); + if (!lk_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to lock base shard"); + goto err; + } + lk_local = mem_get0(this->local_pool); + if (!lk_local) { + STACK_DESTROY(lk_frame->root); + goto err; + } + + lk_frame->local = lk_local; + lk_local->main_frame = frame; + int_inodelk = &lk_local->int_inodelk; + + int_inodelk->flock.l_len = 0; + int_inodelk->flock.l_start = 0; + int_inodelk->domain = this->name; + int_inodelk->flock.l_type = F_WRLCK; + loc_copy(&local->int_inodelk.loc, loc); + set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); + local->inodelk_frame = lk_frame; + + STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, + &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + 
shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + if (local->fop == GF_FOP_UNLINK) + loc = &local->loc; + else if (local->fop == GF_FOP_RENAME) + loc = &local->loc2; + shard_acquire_inodelk(frame, this, loc); + return 0; +} + +int +shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t handler, + shard_internal_dir_type_t type); +int +shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + return 0; +} + +void +shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + local->dot_shard_rm_loc.inode = inode_find(this->itable, + priv->dot_shard_rm_gfid); + if (!local->dot_shard_rm_loc.inode) { + local->dot_shard_loc.inode = inode_find(this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_pre_mkdir_rm_handler; + shard_refresh_internal_dir(frame, this, + SHARD_INTERNAL_DIR_DOT_SHARD); + } + } else { + local->post_res_handler = shard_post_mkdir_rm_handler; + shard_refresh_internal_dir(frame, this, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + } +} + +int +shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if ((ret) && 
(!IA_ISLNK(loc->inode->ia_type))) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, loc); + local->xflag = xflag; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + local->block_size = block_size; + local->resolver_base_inode = loc->inode; + local->fop = GF_FOP_UNLINK; + if (!this->itable) + this->itable = (local->loc.inode)->table; + + local->resolve_not = _gf_true; + shard_begin_rm_resolution(frame, this); + return 0; +err: + shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) +{ + shard_rename_cbk(frame, this); + return 0; +} + +int +shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + /* Set ctx->refresh to TRUE to force a lookup on disk when + * shard_lookup_base_file() is called next to refresh the hard link + * count in ctx. Note that this is applicable only to the case where + * the rename dst is already existent and sharded. 
+ */ + if ((local->dst_block_size) && (!local->cleanup_required)) + shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); + + local->prebuf = *buf; + local->preoldparent = *preoldparent; + local->postoldparent = *postoldparent; + local->prenewparent = *prenewparent; + local->postnewparent = *postnewparent; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + + if (local->dst_block_size) { + if (local->entrylk_frame) { + ret = shard_unlock_entrylk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } + + ret = shard_unlock_inodelk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + goto err; + } + if (local->cleanup_required) + shard_start_background_deletion(this); + } + + /* Now the base file of src, if sharded, is looked up to gather ia_size + * and ia_blocks.*/ + if (local->block_size) { + local->tmp_loc.inode = inode_new(this->itable); + gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); + shard_refresh_base_file(frame, this, &local->tmp_loc, NULL, + shard_post_rename_lookup_handler); + } else { + shard_rename_cbk(frame, this); + } + return 0; +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + /* Save dst base file attributes into postbuf so the information is not + * lost when it is overwritten after lookup on base file of src in + * shard_lookup_base_file_cbk(). 
+ */ + local->postbuf = local->prebuf; + shard_rename_src_base_file(frame, this); + return 0; +} + +int +shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + uint64_t dst_block_size = 0; + shard_local_t *local = NULL; + + if (IA_ISDIR(oldloc->inode->ia_type)) { + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); + if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(oldloc->inode->gfid)); + goto err; + } + + if (newloc->inode) + ret = shard_inode_ctx_get_block_size(newloc->inode, this, + &dst_block_size); + + /* The following stack_wind covers the case where: + * a. the src file is not sharded and dst doesn't exist, OR + * b. the src and dst both exist but are not sharded. + */ + if (((!block_size) && (!dst_block_size)) || + frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + local->resolver_base_inode = newloc->inode; + local->fop = GF_FOP_RENAME; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->block_size = block_size; + local->dst_block_size = dst_block_size; + if (!this->itable) + this->itable = (local->loc.inode)->table; + local->resolve_not = _gf_true; + + /* The following if-block covers the case where the dst file exists + * and is sharded. 
+ */ + if (local->dst_block_size) { + shard_begin_rm_resolution(frame, this); + } else { + /* The following block covers the case where the dst either doesn't + * exist or is NOT sharded but the src is sharded. In this case, shard + * xlator would go ahead and rename src to dst. Once done, it would also + * lookup the base shard of src to get the ia_size and ia_blocks xattr + * values. + */ + shard_rename_src_base_file(frame, this); + } + return 0; + +err: + shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); + return 0; +} + +int +shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, + SHARD_ALL_MASK); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, + "Failed to set inode " + "ctx for %s", + uuid_utoa(inode->gfid)); + +unwind: + SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + return 0; +} + +int +shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->block_size = priv->block_size; + + if (!__is_gsyncd_on_shard_dir(frame, loc)) { + SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); + } + + STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); + return 0; +} + +int 
+shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + /* To-Do: Handle open with O_TRUNC under locks */ + SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int +shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int +shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + int i = 0; + int call_count = 0; + void *address = NULL; + uint64_t block_num = 0; + off_t off = 0; + struct iovec vec = { + 0, + }; + shard_local_t *local = NULL; + fd_t *anon_fd = cookie; + shard_inode_ctx_t *ctx = NULL; + + local = frame->local; + + /* If shard has already seen a failure here before, there is no point + * in aggregating subsequent reads, so just go to out. 
+ */ + if (local->op_ret < 0) + goto out; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto out; + } + + if (local->op_ret >= 0) + local->op_ret += op_ret; + + shard_inode_ctx_get(anon_fd->inode, this, &ctx); + block_num = ctx->block_num; + + if (block_num == local->first_block) { + address = local->iobuf->ptr; + } else { + /* else + * address to start writing to = beginning of buffer + + * number of bytes until end of first block + + * + block_size times number of blocks + * between the current block and the first + */ + address = (char *)local->iobuf->ptr + + (local->block_size - (local->offset % local->block_size)) + + ((block_num - local->first_block - 1) * local->block_size); + } + + for (i = 0; i < count; i++) { + address = (char *)address + off; + memcpy(address, vector[i].iov_base, vector[i].iov_len); + off += vector[i].iov_len; + } + +out: + if (anon_fd) + fd_unref(anon_fd); + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + } else { + if (xdata) + local->xattr_rsp = dict_ref(xdata); + vec.iov_base = local->iobuf->ptr; + if (local->offset + local->req_size > local->prebuf.ia_size) + local->total_size = local->prebuf.ia_size - local->offset; + vec.iov_len = local->total_size; + local->op_ret = local->total_size; + SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, + &vec, 1, &local->prebuf, local->iobref, + local->xattr_rsp); + return 0; + } + } + + return 0; +} + +int +shard_readv_do(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + int last_block = 0; + int cur_block = 0; + off_t orig_offset = 0; + off_t shard_offset = 0; + size_t read_size = 0; + size_t remaining_size = 0; + fd_t *fd = NULL; + fd_t *anon_fd = NULL; + shard_local_t *local = NULL; + gf_boolean_t wind_failed = _gf_false; + + local = 
frame->local; + fd = local->fd; + + orig_offset = local->offset; + cur_block = local->first_block; + last_block = local->last_block; + remaining_size = local->total_size; + local->call_count = call_count = local->num_blocks; + + SHARD_SET_ROOT_FS_ID(frame, local); + + if (fd->flags & O_DIRECT) + local->flags = O_DIRECT; + + while (cur_block <= last_block) { + if (wind_failed) { + shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, + 0, NULL, NULL, NULL); + goto next; + } + + shard_offset = orig_offset % local->block_size; + read_size = local->block_size - shard_offset; + if (read_size > remaining_size) + read_size = remaining_size; + + remaining_size -= read_size; + + if (cur_block == 0) { + anon_fd = fd_ref(fd); + } else { + anon_fd = fd_anonymous(local->inode_list[i]); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, + ENOMEM, NULL, 0, NULL, NULL, NULL); + goto next; + } + } + + STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, anon_fd, read_size, + shard_offset, local->flags, local->xattr_req); + + orig_offset += read_size; + next: + cur_block++; + i++; + call_count--; + } + return 0; +} + +int +shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int shard_block_num = (long)cookie; + int call_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + if (op_errno == EEXIST) { + LOCK(&frame->lock); + { + local->eexist_count++; + } + UNLOCK(&frame->lock); + } else { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + gf_msg_debug(this->name, 0, + "mknod of shard %d " + "failed: %s", + shard_block_num, strerror(op_errno)); + goto done; + } + + shard_link_block_inode(local, 
shard_block_num, inode, buf); + +done: + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + local->create_count = 0; + local->post_mknod_handler(frame, this); + } + + return 0; +} + +int +shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, + shard_post_mknod_fop_handler_t post_mknod_handler) +{ + int i = 0; + int shard_idx_iter = 0; + int last_block = 0; + int ret = 0; + int call_count = 0; + char path[PATH_MAX] = { + 0, + }; + mode_t mode = 0; + char *bname = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t ctx_tmp = { + 0, + }; + shard_local_t *local = NULL; + gf_boolean_t wind_failed = _gf_false; + fd_t *fd = NULL; + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + + local = frame->local; + priv = this->private; + fd = local->fd; + shard_idx_iter = local->first_block; + last_block = local->last_block; + call_count = local->call_count = local->create_count; + local->post_mknod_handler = post_mknod_handler; + + SHARD_SET_ROOT_FS_ID(frame, local); + + ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get inode " + "ctx for %s", + uuid_utoa(fd->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type); + + while (shard_idx_iter <= last_block) { + if (local->inode_list[i]) { + shard_idx_iter++; + i++; + continue; + } + + if (wind_failed) { + shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, + -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, + sizeof(path)); + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, + -1, ENOMEM, NULL, NULL, 
NULL, NULL, NULL); + goto next; + } + + bname = strrchr(path, '/') + 1; + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0 || !(loc.inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + "on %s, base file gfid = %s", + bname, uuid_utoa(fd->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + loc_wipe(&loc); + dict_unref(xattr_req); + shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, + -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + goto next; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + + STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, + (void *)(long)shard_idx_iter, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, &loc, mode, + ctx_tmp.stat.ia_rdev, 0, xattr_req); + loc_wipe(&loc); + dict_unref(xattr_req); + + next: + shard_idx_iter++; + i++; + if (!--call_count) + break; + } + + return 0; +err: + /* + * This block is for handling failure in shard_inode_ctx_get_all(). + * Failures in the while-loop are handled within the loop. 
+ */ + SHARD_UNSET_ROOT_FS_ID(frame, local); + post_mknod_handler(frame, this); + return 0; +} + +int +shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); + +int +shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->create_count) { + shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); + } else { + shard_readv_do(frame, this); + } + + return 0; +} + +int +shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (!local->eexist_count) { + shard_readv_do(frame, this); + } else { + local->call_count = local->eexist_count; + shard_common_lookup_shards(frame, this, local->loc.inode, + shard_post_lookup_shards_readv_handler); + } + return 0; +} + +int +shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + if (local->op_errno != ENOENT) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } else { + struct iovec vec = { + 0, + }; + + vec.iov_base = local->iobuf->ptr; + vec.iov_len = local->total_size; + local->op_ret = local->total_size; + SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, + &local->prebuf, local->iobref, NULL); + return 0; + } + } + + if (local->call_count) { + shard_common_lookup_shards(frame, this, local->resolver_base_inode, + shard_post_lookup_shards_readv_handler); + } else { + shard_readv_do(frame, this); + } + + return 0; +} + +int +shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + 
struct iobuf *iobuf = NULL; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->offset >= local->prebuf.ia_size) { + /* If the read is being performed past the end of the file, + * unwind the FOP with 0 bytes read as status. + */ + struct iovec vec = { + 0, + }; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); + if (!iobuf) + goto err; + + vec.iov_base = iobuf->ptr; + vec.iov_len = 0; + local->iobref = iobref_new(); + iobref_add(local->iobref, iobuf); + iobuf_unref(iobuf); + + SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, + local->iobref, NULL); + return 0; + } + + local->first_block = get_lowest_block(local->offset, local->block_size); + + local->total_size = local->req_size; + + local->last_block = get_highest_block(local->offset, local->total_size, + local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = local->loc.inode; + + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + goto err; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); + if (!iobuf) + goto err; + + local->iobref = iobref_new(); + if (!local->iobref) { + iobuf_unref(iobuf); + goto err; + } + + if (iobref_add(local->iobref, iobuf) != 0) { + iobuf_unref(iobuf); + goto err; + } + + memset(iobuf->ptr, 0, local->total_size); + iobuf_unref(iobuf); + local->iobuf = iobuf; + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + ret = shard_init_internal_dir_loc(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; + shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, + 
SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_readv_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; +err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); + return 0; +} + +int +shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + /* block_size = 0 means that the file was created before + * sharding was enabled on the volume. + */ + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + local->fd = fd_ref(fd); + local->block_size = block_size; + local->offset = offset; + local->req_size = size; + local->flags = flags; + local->fop = GF_FOP_READ; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_readv_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); + return 0; +} + +int +shard_common_inode_write_post_update_size_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + } else { + shard_common_inode_write_success_unwind(local->fop, frame, + local->written_size); + } + return 0; +} + +static gf_boolean_t +shard_is_appending_write(shard_local_t *local) +{ + if (local->fop != GF_FOP_WRITE) + return _gf_false; + if (local->flags & O_APPEND) + return _gf_true; + if (local->fd->flags & O_APPEND) + return _gf_true; + return _gf_false; +} + +int +__shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + if (shard_is_appending_write(local)) { + local->delta_size = local->total_size; + } else if (local->offset + local->total_size > ctx->stat.ia_size) { + local->delta_size = (local->offset + local->total_size) - + ctx->stat.ia_size; + } else { + local->delta_size = 0; + } + ctx->stat.ia_size += (local->delta_size); + local->postbuf = ctx->stat; + + return 0; +} + +int +shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, + 
xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + int call_count = 0; + fd_t *anon_fd = cookie; + shard_local_t *local = NULL; + glusterfs_fop_t fop = 0; + + local = frame->local; + fop = local->fop; + + LOCK(&frame->lock); + { + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + local->written_size += op_ret; + GF_ATOMIC_ADD(local->delta_blocks, + post->ia_blocks - pre->ia_blocks); + local->delta_size += (post->ia_size - pre->ia_size); + shard_inode_ctx_set(local->fd->inode, this, post, 0, + SHARD_MASK_TIMES); + if (local->fd->inode != anon_fd->inode) + shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, + anon_fd->inode); + } + } + UNLOCK(&frame->lock); + + if (anon_fd) + fd_unref(anon_fd); + + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + if (local->op_ret < 0) { + shard_common_failure_unwind(fop, frame, local->op_ret, + local->op_errno); + } else { + shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); + local->hole_size = 0; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + shard_update_file_size( + frame, this, local->fd, NULL, + shard_common_inode_write_post_update_size_handler); + } + } + + return 0; +} + +int +shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vec, int count, off_t shard_offset, + size_t size) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { + case GF_FOP_WRITE: + STACK_WIND_COOKIE( + frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset, + local->flags, local->iobref, local->xattr_req); + break; + case GF_FOP_FALLOCATE: + STACK_WIND_COOKIE( + frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, local->flags, + shard_offset, size, local->xattr_req); + 
break; + case GF_FOP_ZEROFILL: + STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, + shard_offset, size, local->xattr_req); + break; + case GF_FOP_DISCARD: + STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, + shard_offset, size, local->xattr_req); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", local->fop); + break; + } + return 0; +} + +int +shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int count = 0; + int call_count = 0; + int last_block = 0; + uint32_t cur_block = 0; + fd_t *fd = NULL; + fd_t *anon_fd = NULL; + shard_local_t *local = NULL; + struct iovec *vec = NULL; + gf_boolean_t wind_failed = _gf_false; + gf_boolean_t odirect = _gf_false; + off_t orig_offset = 0; + off_t shard_offset = 0; + off_t vec_offset = 0; + size_t remaining_size = 0; + size_t shard_write_size = 0; + + local = frame->local; + fd = local->fd; + + orig_offset = local->offset; + remaining_size = local->total_size; + cur_block = local->first_block; + local->call_count = call_count = local->num_blocks; + last_block = local->last_block; + + SHARD_SET_ROOT_FS_ID(frame, local); + + if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC + " into " + "dict: %s", + uuid_utoa(fd->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + local->call_count = 1; + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, + ENOMEM, NULL, NULL, NULL); + return 0; + } + + if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) + odirect = _gf_true; + + while (cur_block <= last_block) { + if (wind_failed) { + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, + ENOMEM, NULL, NULL, NULL); + goto 
next; + } + + shard_offset = orig_offset % local->block_size; + shard_write_size = local->block_size - shard_offset; + if (shard_write_size > remaining_size) + shard_write_size = remaining_size; + + remaining_size -= shard_write_size; + + if (local->fop == GF_FOP_WRITE) { + vec = NULL; + count = iov_subset(local->vector, local->count, vec_offset, + shard_write_size, &vec, 0); + if (count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, + -1, ENOMEM, NULL, NULL, NULL); + goto next; + } + } + + if (cur_block == 0) { + anon_fd = fd_ref(fd); + } else { + anon_fd = fd_anonymous(local->inode_list[i]); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + GF_FREE(vec); + shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, + this, -1, ENOMEM, NULL, NULL, + NULL); + goto next; + } + + if (local->fop == GF_FOP_WRITE) { + if (odirect) + local->flags = O_DIRECT; + else + local->flags = GF_ANON_FD_FLAGS; + } + } + + shard_common_inode_write_wind(frame, this, anon_fd, vec, count, + shard_offset, shard_write_size); + if (vec) + vec_offset += shard_write_size; + orig_offset += shard_write_size; + GF_FREE(vec); + vec = NULL; + next: + cur_block++; + i++; + call_count--; + } + return 0; +} + +int +shard_common_inode_write_post_mknod_handler(call_frame_t *frame, + xlator_t *this); + +int +shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->create_count) { + shard_common_resume_mknod(frame, this, + shard_common_inode_write_post_mknod_handler); + } else { + shard_common_inode_write_do(frame, this); + } + + return 0; +} + +int +shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t 
*this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (!local->eexist_count) { + shard_common_inode_write_do(frame, this); + } else { + local->call_count = local->eexist_count; + shard_common_lookup_shards( + frame, this, local->loc.inode, + shard_common_inode_write_post_lookup_shards_handler); + } + + return 0; +} + +int +shard_common_inode_write_post_resolve_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->call_count) { + shard_common_lookup_shards( + frame, this, local->resolver_base_inode, + shard_common_inode_write_post_lookup_shards_handler); + } else if (local->create_count) { + shard_common_inode_write_post_lookup_shards_handler(frame, this); + } else { + shard_common_inode_write_do(frame, this); + } + + return 0; +} + +int +shard_common_inode_write_post_lookup_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = frame->local; + shard_priv_t *priv = this->private; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + local->postbuf = local->prebuf; + + /*Adjust offset to EOF so that correct shard is chosen for append*/ + if (shard_is_appending_write(local)) + local->offset = local->prebuf.ia_size; + + local->first_block = get_lowest_block(local->offset, local->block_size); + local->last_block = get_highest_block(local->offset, local->total_size, + local->block_size); + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) { + 
shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + + gf_msg_trace(this->name, 0, + "%s: gfid=%s first_block=%" PRIu64 + " " + "last_block=%" PRIu64 " num_blocks=%" PRIu64 " offset=%" PRId64 + " total_size=%zu flags=%" PRId32 "", + gf_fop_list[local->fop], + uuid_utoa(local->resolver_base_inode->gfid), + local->first_block, local->last_block, local->num_blocks, + local->offset, local->total_size, local->flags); + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + + if (!local->dot_shard_loc.inode) { + /*change handler*/ + shard_mkdir_internal_dir(frame, this, + shard_common_inode_write_post_resolve_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + /*change handler*/ + local->post_res_handler = shard_common_inode_write_post_resolve_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; +} + +int +shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + inode_t *link_inode = NULL; + shard_local_t *local = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + + if (op_ret == -1) { + if (op_errno != EEXIST) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } else { + gf_msg_debug(this->name, 0, + "mkdir on %s failed " + "with EEXIST. 
Attempting lookup now", + shard_internal_dir_string(type)); + shard_lookup_internal_dir(frame, this, local->post_res_handler, + type); + return 0; + } + } + + link_inode = shard_link_internal_dir_inode(local, inode, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { + shard_inode_ctx_mark_dir_refreshed(link_inode, this); + shard_common_resolve_shards(frame, this, local->post_res_handler); + } + return 0; +unwind: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t handler, + shard_internal_dir_type_t type) +{ + int ret = -1; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + dict_t *xattr_req = NULL; + uuid_t *gfid = NULL; + loc_t *loc = NULL; + gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + + local->post_res_handler = handler; + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid) + goto err; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); + loc = &local->dot_shard_rm_loc; + break; + default: + bzero(*gfid, sizeof(uuid_t)); + break; + } + + xattr_req = dict_new(); + if (!xattr_req) + goto err; + + ret = shard_init_internal_dir_loc(this, local, type); + if (ret) + goto err; + + ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid-req for %s", + shard_internal_dir_string(type)); + goto err; + } else { + free_gfid = _gf_false; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + + STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, + 0755, 0, xattr_req); + 
dict_unref(xattr_req); + return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + local->op_ret = -1; + local->op_errno = ENOMEM; + if (free_gfid) + GF_FREE(gfid); + handler(frame, this); + return 0; +} + +int +shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + /* To-Do: Wind flush on all shards of the file */ + SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +} + +int +__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + local->postbuf.ia_ctime = ctx->stat.ia_ctime; + local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; + local->postbuf.ia_atime = ctx->stat.ia_atime; + local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; + local->postbuf.ia_mtime = ctx->stat.ia_mtime; + local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; + + return 0; +} + +int +shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int call_count = 0; + uint64_t fsync_count = 0; + fd_t *anon_fd = cookie; + shard_local_t *local = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *base_ictx = NULL; + inode_t *base_inode = NULL; + gf_boolean_t unref_shard_inode = 
_gf_false; + + local = frame->local; + base_inode = local->fd->inode; + + if (local->op_ret < 0) + goto out; + + LOCK(&frame->lock); + { + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + goto out; + } + shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, + SHARD_MASK_TIMES); + } + UNLOCK(&frame->lock); + fd_ctx_get(anon_fd, this, &fsync_count); +out: + if (anon_fd && (base_inode != anon_fd->inode)) { + LOCK(&base_inode->lock); + LOCK(&anon_fd->inode->lock); + { + __shard_inode_ctx_get(anon_fd->inode, this, &ctx); + __shard_inode_ctx_get(base_inode, this, &base_ictx); + if (op_ret == 0) + ctx->fsync_needed -= fsync_count; + GF_ASSERT(ctx->fsync_needed >= 0); + if (ctx->fsync_needed != 0) { + list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); + base_ictx->fsync_count++; + } else { + unref_shard_inode = _gf_true; + } + } + UNLOCK(&anon_fd->inode->lock); + UNLOCK(&base_inode->lock); + } + + if (unref_shard_inode) + inode_unref(anon_fd->inode); + if (anon_fd) + fd_unref(anon_fd); + + call_count = shard_call_count_return(frame); + if (call_count != 0) + return 0; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, + local->op_errno); + } else { + shard_get_timestamps_from_inode_ctx(local, base_inode, this); + SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } + return 0; +} + +int +shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + int call_count = 0; + int fsync_count = 0; + fd_t *anon_fd = NULL; + inode_t *base_inode = NULL; + shard_local_t *local = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *iter = NULL; + struct list_head copy = { + 0, + }; + shard_inode_ctx_t *tmp = NULL; + + local = frame->local; + base_inode = local->fd->inode; + local->postbuf = local->prebuf; + INIT_LIST_HEAD(©); + + if (local->op_ret < 0) { + 
shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, + local->op_errno); + return 0; + } + + LOCK(&base_inode->lock); + { + __shard_inode_ctx_get(base_inode, this, &ctx); + list_splice_init(&ctx->to_fsync_list, ©); + call_count = ctx->fsync_count; + ctx->fsync_count = 0; + } + UNLOCK(&base_inode->lock); + + local->call_count = ++call_count; + + /* Send fsync() on the base shard first */ + anon_fd = fd_ref(local->fd); + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, + local->xattr_req); + call_count--; + anon_fd = NULL; + + list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) + { + list_del_init(&iter->to_fsync_list); + fsync_count = 0; + shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); + GF_ASSERT(fsync_count > 0); + anon_fd = fd_anonymous(iter->inode); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + SHARD_MSG_MEMALLOC_FAILED, + "Failed to create " + "anon fd to fsync shard"); + shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, + ENOMEM, NULL, NULL, NULL); + continue; + } + + ret = fd_ctx_set(anon_fd, this, fsync_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, + "Failed to set fd " + "ctx for shard inode gfid=%s", + uuid_utoa(iter->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, + ENOMEM, NULL, NULL, NULL); + continue; + } + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + anon_fd, local->datasync, local->xattr_req); + call_count--; + } + + return 0; +} + +int +shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + int ret = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, 
&block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->fd = fd_ref(fd); + local->fop = GF_FOP_FSYNC; + local->datasync = datasync; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_fsync_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); + return 0; +} + +int +shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) + { + list_del_init(&entry->list); + list_add_tail(&entry->list, &local->entries_head.list); + + if (!entry->dict) + continue; + + if (IA_ISDIR(entry->d_stat.ia_type)) + continue; + + if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) + shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + if (!entry->inode) + continue; + + shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); + } + local->op_ret += op_ret; + +unwind: + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, + &local->entries_head, 
xdata); + else + SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); + return 0; +} + +int32_t +shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + fd_t *fd = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + gf_boolean_t last_entry = _gf_false; + + local = frame->local; + fd = local->fd; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) + { + if (last_entry) + last_entry = _gf_false; + + if (__is_root_gfid(fd->inode->gfid) && + !(strcmp(entry->d_name, GF_SHARD_DIR))) { + local->offset = entry->d_off; + op_ret--; + last_entry = _gf_true; + continue; + } + + list_del_init(&entry->list); + list_add_tail(&entry->list, &local->entries_head.list); + + if (!entry->dict) + continue; + + if (IA_ISDIR(entry->d_stat.ia_type)) + continue; + + if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && + frame->root->pid != GF_CLIENT_PID_GSYNCD) + shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + + if (!entry->inode) + continue; + + shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); + } + + local->op_ret = op_ret; + + if (last_entry) { + if (local->fop == GF_FOP_READDIR) + STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, + local->fd, local->readdir_size, local->offset, + local->xattr_req); + else + STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, + local->fd, local->readdir_size, local->offset, + local->xattr_req); + return 0; + } + +unwind: + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, + &local->entries_head, xdata); + else + SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); + return 0; +} + +int 
+shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } + + frame->local = local; + + local->fd = fd_ref(fd); + local->fop = whichop; + local->readdir_size = size; + INIT_LIST_HEAD(&local->entries_head.list); + local->list_inited = _gf_true; + + if (whichop == GF_FOP_READDIR) { + STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + } else { + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to set " + "dict value: key:%s, directory gfid=%s", + GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); + goto err; + } + + STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, + local->xattr_req); + } + + return 0; + +err: + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +int32_t +shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; +} + +int32_t +shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); + return 0; +} + +int32_t +shard_modify_and_set_iatt_in_dict(dict_t *xdata, shard_local_t *local, + char *key) +{ + int ret = 0; + struct iatt *tmpbuf = NULL; + struct iatt *stbuf = NULL; + data_t *data = NULL; + + if (!xdata) + return 0; + + data = dict_get(xdata, key); + if (!data) + return 0; + + tmpbuf = data_to_iatt(data, key); + stbuf = 
GF_MALLOC(sizeof(struct iatt), gf_common_mt_char); + if (stbuf == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + *stbuf = *tmpbuf; + stbuf->ia_size = local->prebuf.ia_size; + stbuf->ia_blocks = local->prebuf.ia_blocks; + ret = dict_set_iatt(xdata, key, stbuf, false); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + return 0; + +err: + GF_FREE(stbuf); + return -1; +} + +int32_t +shard_common_remove_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT); + if (ret < 0) + goto err; + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT); + if (ret < 0) + goto err; + + if (local->fd) + SHARD_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + xdata); + else + SHARD_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + xdata); + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int32_t +shard_post_lookup_remove_xattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->fd) + STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, local->fd, + local->name, local->xattr_req); + else + STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, &local->loc, + local->name, local->xattr_req); + return 0; +} + +int32_t +shard_common_remove_xattr(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop, loc_t *loc, 
fd_t *fd, + const char *name, dict_t *xdata) +{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + inode_t *inode = loc ? loc->inode : fd->inode; + + if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, + xdata); + return 0; + } + + /* If shard's special xattrs are attempted to be removed, + * fail the fop with EPERM (except if the client is gsyncd). + */ + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + GF_IF_NATIVE_XATTR_GOTO(SHARD_XATTR_PREFIX "*", name, op_errno, err); + } + + /* Repeat the same check for bulk-removexattr */ + if (xdata && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(xdata, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(xdata, GF_XATTR_SHARD_FILE_SIZE); + } + + ret = shard_inode_ctx_get_block_size(inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, + xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->fop = fop; + if (loc) { + if (loc_copy(&local->loc, loc) != 0) + goto err; + } + + if (fd) { + local->fd = fd_ref(fd); + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + } + + if (name) { + local->name = gf_strdup(name); + if (!local->name) + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + 
shard_refresh_base_file(frame, this, loc, fd, + shard_post_lookup_remove_xattr_handler); + return 0; +err: + shard_common_failure_unwind(fop, frame, -1, op_errno); + return 0; +} + +int32_t +shard_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + shard_common_remove_xattr(frame, this, GF_FOP_REMOVEXATTR, loc, NULL, name, + xdata); + return 0; +} + +int32_t +shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + shard_common_remove_xattr(frame, this, GF_FOP_FREMOVEXATTR, NULL, fd, name, + xdata); + return 0; +} + +int32_t +shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); + } + +unwind: + SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int op_errno = EINVAL; + + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && + (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { + op_errno = ENODATA; + goto out; + } + + STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); + return 0; +} + +int32_t +shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); + } + +unwind: + SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, 
xdata); + return 0; +} + +int32_t +shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int op_errno = EINVAL; + + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && + (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { + op_errno = ENODATA; + goto out; + } + + STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); + return 0; +} + +int32_t +shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT); + if (ret < 0) + goto err; + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT); + if (ret < 0) + goto err; + + if (local->fd) + SHARD_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + else + SHARD_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + xdata); + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int32_t +shard_post_lookup_set_xattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->fd) + STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, local->fd, + local->xattr_req, local->flags, local->xattr_rsp); + else + STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, &local->loc, + local->xattr_req, local->flags, 
local->xattr_rsp); + return 0; +} + +int32_t +shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, fd_t *fd, dict_t *dict, int32_t flags, + dict_t *xdata) +{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + inode_t *inode = loc ? loc->inode : fd->inode; + + if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + return 0; + } + + /* Sharded or not, if shard's special xattrs are attempted to be set, + * fail the fop with EPERM (except if the client is gsyncd. + */ + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + GF_IF_INTERNAL_XATTR_GOTO(SHARD_XATTR_PREFIX "*", dict, op_errno, err); + } + + ret = shard_inode_ctx_get_block_size(inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->fop = fop; + if (loc) { + if (loc_copy(&local->loc, loc) != 0) + goto err; + } + + if (fd) { + local->fd = fd_ref(fd); + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + } + local->flags = flags; + /* Reusing local->xattr_req and local->xattr_rsp to store the setxattr dict + * and the xdata dict + */ + if (dict) + local->xattr_req = dict_ref(dict); + 
if (xdata) + local->xattr_rsp = dict_ref(xdata); + + shard_refresh_base_file(frame, this, loc, fd, + shard_post_lookup_set_xattr_handler); + return 0; +err: + shard_common_failure_unwind(fop, frame, -1, op_errno); + return 0; +} + +int32_t +shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + shard_common_set_xattr(frame, this, GF_FOP_FSETXATTR, NULL, fd, dict, flags, + xdata); + return 0; +} + +int32_t +shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + shard_common_set_xattr(frame, this, GF_FOP_SETXATTR, loc, NULL, dict, flags, + xdata); + return 0; +} + +int +shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->fop == GF_FOP_SETATTR) { + if (local->op_ret >= 0) + shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, + SHARD_LOOKUP_MASK); + SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } else if (local->fop == GF_FOP_FSETATTR) { + if (local->op_ret >= 0) + shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, + SHARD_LOOKUP_MASK); + SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } + + return 0; +} + +int +shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *prebuf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + if (xdata) + local->xattr_rsp = dict_ref(xdata); + local->postbuf = *postbuf; + 
local->postbuf.ia_size = local->prebuf.ia_size; + local->postbuf.ia_blocks = local->prebuf.ia_blocks; + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_setattr_handler; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_SETATTR; + loc_copy(&local->loc, loc); + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, + local, err); + + STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); + return 0; +} + +int +shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_setattr_handler; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_FSETATTR; + local->fd = fd_ref(fd); + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + + STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, + local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); + return 0; +} + +int +shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop, fd_t *fd, + struct iovec *vector, int32_t count, + off_t offset, uint32_t flags, size_t len, + struct iobref *iobref, dict_t *xdata) +{ + int ret = 0; + int i = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + /* block_size = 0 means that the file was created before + * sharding was enabled on the volume. 
+ */ + switch (fop) { + case GF_FOP_WRITE: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, + count, offset, flags, iobref, xdata); + break; + case GF_FOP_FALLOCATE: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, flags, + offset, len, xdata); + break; + case GF_FOP_ZEROFILL: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, + len, xdata); + break; + case GF_FOP_DISCARD: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, + len, xdata); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto out; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto out; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto out; + + if (vector) { + local->vector = iov_dup(vector, count); + if (!local->vector) + goto out; + for (i = 0; i < count; i++) + local->total_size += vector[i].iov_len; + local->count = count; + } else { + local->total_size = len; + } + + local->fop = fop; + local->offset = offset; + local->flags = flags; + if (iobref) + local->iobref = iobref_ref(iobref); + local->fd = fd_ref(fd); + local->block_size = block_size; + local->resolver_base_inode = local->fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_common_inode_write_post_lookup_handler); + return 0; +out: + shard_common_failure_unwind(fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct 
iobref *iobref, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, + offset, flags, 0, iobref, xdata); + return 0; +} + +int +shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && + (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) + goto out; + + shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, + offset, keep_size, len, NULL, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); + return 0; +} + +int +shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, + offset, 0, len, NULL, xdata); + return 0; +} + +int +shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, + offset, 0, len, NULL, xdata); + return 0; +} + +int32_t +shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + /* TBD */ + gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, + "seek called on %s.", uuid_utoa(fd->inode->gfid)); + shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, + "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + if (!this) { + gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, + "this is NULL. 
init() failed"); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, + "Dangling volume. Check volfile"); + goto out; + } + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, + "shard not configured with exactly one sub-volume. " + "Check volfile"); + goto out; + } + + priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); + if (!priv) + goto out; + + GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); + + GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); + + GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); + + this->local_pool = mem_pool_new(shard_local_t, 128); + if (!this->local_pool) { + ret = -1; + goto out; + } + gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); + gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); + + this->private = priv; + LOCK_INIT(&priv->lock); + INIT_LIST_HEAD(&priv->ilist_head); + ret = 0; +out: + if (ret) { + GF_FREE(priv); + mem_pool_destroy(this->local_pool); + } + + return ret; +} + +void +fini(xlator_t *this) +{ + shard_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("shard", this, out); + + /*Itable was not created by shard, hence setting to NULL.*/ + this->itable = NULL; + + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + + priv = this->private; + if (!priv) + goto out; + + this->private = NULL; + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); + +out: + return; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); + + GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, + uint32, out); + ret = 0; + +out: + return ret; +} + +int +shard_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + shard_priv_t 
*priv = NULL; + + priv = this->private; + if (!priv) + return 0; + + inode_ctx_del(inode, this, &ctx_uint); + if (!ctx_uint) + return 0; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + /* When LRU limit reaches inode will be forcefully removed from the + * table, inode needs to be removed from LRU of shard as well. + */ + if (!list_empty(&ctx->ilist)) { + LOCK(&priv->lock); + { + list_del_init(&ctx->ilist); + priv->inode_count--; + } + UNLOCK(&priv->lock); + } + GF_FREE(ctx); + + return 0; +} + +int +shard_release(xlator_t *this, fd_t *fd) +{ + /* TBD */ + return 0; +} + +int +shard_priv_dump(xlator_t *this) +{ + shard_priv_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char *str = NULL; + + priv = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + str = gf_uint64_2human_readable(priv->block_size); + gf_proc_dump_write("shard-block-size", "%s", str); + gf_proc_dump_write("inode-count", "%d", priv->inode_count); + gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); + gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); + + GF_FREE(str); + + return 0; +} + +int +shard_releasedir(xlator_t *this, fd_t *fd) +{ + return 0; +} + +struct xlator_fops fops = { + .lookup = shard_lookup, + .open = shard_open, + .flush = shard_flush, + .fsync = shard_fsync, + .stat = shard_stat, + .fstat = shard_fstat, + .getxattr = shard_getxattr, + .fgetxattr = shard_fgetxattr, + .readv = shard_readv, + .writev = shard_writev, + .truncate = shard_truncate, + .ftruncate = shard_ftruncate, + .setxattr = shard_setxattr, + .fsetxattr = shard_fsetxattr, + .setattr = shard_setattr, + .fsetattr = shard_fsetattr, + .removexattr = shard_removexattr, + .fremovexattr = shard_fremovexattr, + .fallocate = shard_fallocate, + .discard = shard_discard, + .zerofill = shard_zerofill, + .readdir = shard_readdir, + .readdirp = shard_readdirp, + .create = shard_create, + 
.mknod = shard_mknod, + .link = shard_link, + .unlink = shard_unlink, + .rename = shard_rename, + .seek = shard_seek, +}; + +struct xlator_cbks cbks = { + .forget = shard_forget, + .release = shard_release, + .releasedir = shard_releasedir, +}; + +struct xlator_dumpops dumpops = { + .priv = shard_priv_dump, +}; + +struct volume_options options[] = { + { + .key = {"shard"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable shard", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"shard-block-size"}, + .type = GF_OPTION_TYPE_SIZET, + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"shard"}, + .default_value = "64MB", + .min = SHARD_MIN_BLOCK_SIZE, + .max = SHARD_MAX_BLOCK_SIZE, + .description = "The size unit used to break a file into multiple " + "chunks", + }, + { + .key = {"shard-deletion-rate"}, + .type = GF_OPTION_TYPE_INT, + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"shard"}, + .default_value = "100", + .min = 100, + .max = INT_MAX, + .description = "The number of shards to send deletes on at a time", + }, + { + .key = {"shard-lru-limit"}, + .type = GF_OPTION_TYPE_INT, + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {"shard"}, + .default_value = "16384", + .min = 20, + .max = INT_MAX, + .description = "The number of resolved shard inodes to keep in " + "memory. A higher number means shards that are " + "resolved will remain in memory longer, avoiding " + "frequent lookups on them when they participate in " + "file operations. 
The option also has a bearing on " + "amount of memory consumed by these inodes and their " + "internal metadata", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "shard", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h new file mode 100644 index 00000000000..4fe181b64d5 --- /dev/null +++ b/xlators/features/shard/src/shard.h @@ -0,0 +1,348 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __SHARD_H__ +#define __SHARD_H__ + +#include <glusterfs/xlator.h> +#include <glusterfs/compat-errno.h> +#include "shard-messages.h" +#include <glusterfs/syncop.h> + +#define GF_SHARD_DIR ".shard" +#define GF_SHARD_REMOVE_ME_DIR ".remove_me" +#define SHARD_MIN_BLOCK_SIZE (4 * GF_UNIT_MB) +#define SHARD_MAX_BLOCK_SIZE (4 * GF_UNIT_TB) +#define SHARD_XATTR_PREFIX "trusted.glusterfs.shard." 
+#define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size" +/** + * Bit masks for the valid flag, which is used while updating ctx + **/ +#define SHARD_MASK_BLOCK_SIZE (1 << 0) +#define SHARD_MASK_PROT (1 << 1) +#define SHARD_MASK_NLINK (1 << 2) +#define SHARD_MASK_UID (1 << 3) +#define SHARD_MASK_GID (1 << 4) +#define SHARD_MASK_SIZE (1 << 6) +#define SHARD_MASK_BLOCKS (1 << 7) +#define SHARD_MASK_TIMES (1 << 8) +#define SHARD_MASK_OTHERS (1 << 9) +#define SHARD_MASK_REFRESH_RESET (1 << 10) + +#define SHARD_INODE_WRITE_MASK \ + (SHARD_MASK_SIZE | SHARD_MASK_BLOCKS | SHARD_MASK_TIMES) + +#define SHARD_LOOKUP_MASK \ + (SHARD_MASK_PROT | SHARD_MASK_NLINK | SHARD_MASK_UID | SHARD_MASK_GID | \ + SHARD_MASK_TIMES | SHARD_MASK_OTHERS) + +#define SHARD_ALL_MASK \ + (SHARD_MASK_BLOCK_SIZE | SHARD_MASK_PROT | SHARD_MASK_NLINK | \ + SHARD_MASK_UID | SHARD_MASK_GID | SHARD_MASK_SIZE | SHARD_MASK_BLOCKS | \ + SHARD_MASK_TIMES | SHARD_MASK_OTHERS) + +#define get_lowest_block(off, shard_size) ((off) / (shard_size)) +#define get_highest_block(off, len, shard_size) \ + (((((off) + (len)) == 0) ? 0 : ((off) + (len)-1)) / (shard_size)) + +int +shard_unlock_inodelk(call_frame_t *frame, xlator_t *this); + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + +#define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label) \ + do { \ + if ((loc->name && !strcmp(GF_SHARD_DIR, loc->name)) && \ + (((loc->parent) && __is_root_gfid(loc->parent->gfid)) || \ + __is_root_gfid(loc->pargfid))) { \ + op_errno = EPERM; \ + goto label; \ + } \ + \ + if ((loc->parent && __is_shard_dir(loc->parent->gfid)) || \ + __is_shard_dir(loc->pargfid)) { \ + op_errno = EPERM; \ + goto label; \ + } \ + } while (0) + +#define SHARD_INODE_OP_CHECK(gfid, err, label) \ + do { \ + if (__is_shard_dir(gfid)) { \ + err = EPERM; \ + goto label; \ + } \ + } while (0) + +#define SHARD_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + shard_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + if (__local && __local->int_inodelk.acquired_lock) \ + shard_unlock_inodelk(frame, frame->this); \ + if (__local && __local->int_entrylk.acquired_lock) \ + shard_unlock_entrylk(frame, frame->this); \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local) { \ + shard_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0) + +/* + * Detach frame->local, destroy the call stack rooted at frame->root, then + * wipe and release the detached shard_local_t (if any). The expansion has no + * trailing semicolon on purpose: the caller supplies it, so the macro behaves + * as one statement and stays usable in an unbraced if/else. + */ +#define SHARD_STACK_DESTROY(frame) \ + do { \ + shard_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + if (__local) { \ + shard_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0) + +#define SHARD_INODE_CREATE_INIT(this, block_size, xattr_req, loc, size, \ + block_count, label) \ + do { \ + int __ret = -1; \ + int64_t *__size_attr = NULL; \ + uint64_t *__bs = 0; \ + \ + __bs = GF_MALLOC(sizeof(uint64_t), gf_shard_mt_uint64_t); \ + if (!__bs) \ + goto label; \ + *__bs = hton64(block_size); \ + __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, __bs, \ + sizeof(*__bs)); \ + if (__ret) { \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set key: %s " \ + "on path %s", \ + GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path); \ + GF_FREE(__bs); \ + goto label; \ + } \ + \ + __ret = shard_set_size_attrs(size, block_count, &__size_attr); \ + if (__ret) \ + goto label; \ + \ + __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, __size_attr, \ + 8 * 4); \ + if (__ret) { \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set key: %s " \ + "on path %s", \ + GF_XATTR_SHARD_FILE_SIZE, (loc)->path); \ + GF_FREE(__size_attr); \ + goto label; \ + } \ + } while (0) + +#define SHARD_MD_READ_FOP_INIT_REQ_DICT(this, dict, gfid, local, label) \ + do { \ + int __ret = -1; \ + \ + __ret = dict_set_uint64(dict, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); \ + if (__ret) { \ + local->op_ret = -1; \ + 
local->op_errno = ENOMEM; \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set dict value:" \ + " key:%s for %s.", \ + GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(gfid)); \ + goto label; \ + } \ + } while (0) + +#define SHARD_SET_ROOT_FS_ID(frame, local) \ + do { \ + if (!local->is_set_fsid) { \ + local->uid = frame->root->uid; \ + local->gid = frame->root->gid; \ + frame->root->uid = 0; \ + frame->root->gid = 0; \ + local->is_set_fsid = _gf_true; \ + } \ + } while (0) + +#define SHARD_UNSET_ROOT_FS_ID(frame, local) \ + do { \ + if (local->is_set_fsid) { \ + frame->root->uid = local->uid; \ + frame->root->gid = local->gid; \ + local->is_set_fsid = _gf_false; \ + } \ + } while (0) + +#define SHARD_TIME_UPDATE(ctx_sec, ctx_nsec, new_sec, new_nsec) \ + do { \ + if (ctx_sec == new_sec) \ + ctx_nsec = new_nsec = max(new_nsec, ctx_nsec); \ + else if (ctx_sec > new_sec) { \ + new_sec = ctx_sec; \ + new_nsec = ctx_nsec; \ + } else { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } while (0) + +typedef enum { + SHARD_BG_DELETION_NONE = 0, + SHARD_BG_DELETION_LAUNCHING, + SHARD_BG_DELETION_IN_PROGRESS, +} shard_bg_deletion_state_t; + +/* rm = "remove me" */ + +typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; + uuid_t dot_shard_rm_gfid; + inode_t *dot_shard_inode; + inode_t *dot_shard_rm_inode; + gf_lock_t lock; + int inode_count; + struct list_head ilist_head; + uint32_t deletion_rate; + shard_bg_deletion_state_t bg_del_state; + gf_boolean_t first_lookup_done; + uint64_t lru_limit; +} shard_priv_t; + +typedef struct { + loc_t loc; + char *domain; + struct gf_flock flock; + gf_boolean_t acquired_lock; +} shard_inodelk_t; + +typedef struct { + loc_t loc; + char *domain; + char *basename; + entrylk_cmd cmd; + entrylk_type type; + gf_boolean_t acquired_lock; +} shard_entrylk_t; + +typedef int32_t (*shard_post_fop_handler_t)(call_frame_t *frame, + xlator_t *this); +typedef int32_t 
(*shard_post_resolve_fop_handler_t)(call_frame_t *frame, + xlator_t *this); +typedef int32_t (*shard_post_lookup_shards_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef int32_t (*shard_post_mknod_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef int32_t (*shard_post_update_size_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef struct shard_local { + int op_ret; + int op_errno; + uint64_t first_block; + uint64_t last_block; + uint64_t num_blocks; + int call_count; + int eexist_count; + int create_count; + int xflag; + int count; + uint32_t flags; + uint32_t uid; + uint32_t gid; + uint64_t block_size; + uint64_t dst_block_size; + int32_t datasync; + off_t offset; + size_t total_size; + size_t written_size; + size_t hole_size; + size_t req_size; + size_t readdir_size; + int64_t delta_size; + gf_atomic_t delta_blocks; + loc_t loc; + loc_t dot_shard_loc; + loc_t dot_shard_rm_loc; + loc_t loc2; + loc_t tmp_loc; + fd_t *fd; + dict_t *xattr_req; + dict_t *xattr_rsp; + inode_t **inode_list; + glusterfs_fop_t fop; + struct iatt prebuf; + struct iatt postbuf; + struct iatt preoldparent; + struct iatt postoldparent; + struct iatt prenewparent; + struct iatt postnewparent; + struct iovec *vector; + struct iobref *iobref; + struct iobuf *iobuf; + gf_dirent_t entries_head; + gf_boolean_t is_set_fsid; + gf_boolean_t list_inited; + shard_post_fop_handler_t handler; + shard_post_lookup_shards_fop_handler_t pls_fop_handler; + shard_post_resolve_fop_handler_t post_res_handler; + shard_post_mknod_fop_handler_t post_mknod_handler; + shard_post_update_size_fop_handler_t post_update_size_handler; + shard_inodelk_t int_inodelk; + shard_entrylk_t int_entrylk; + inode_t *resolver_base_inode; + gf_boolean_t first_lookup_done; + syncbarrier_t barrier; + gf_boolean_t lookup_shards_barriered; + gf_boolean_t unlink_shards_barriered; + gf_boolean_t resolve_not; + loc_t newloc; + call_frame_t *main_frame; + call_frame_t *inodelk_frame; + call_frame_t 
*entrylk_frame; + uint32_t deletion_rate; + gf_boolean_t cleanup_required; + uuid_t base_gfid; + char *name; +} shard_local_t; + +typedef struct shard_inode_ctx { + uint64_t block_size; /* The block size with which this inode is + sharded */ + struct iatt stat; + gf_boolean_t refresh; + /* The following members of inode ctx will be applicable only to the + * individual shards' ctx and never the base file ctx. + */ + struct list_head ilist; + uuid_t base_gfid; + int block_num; + gf_boolean_t refreshed; + struct list_head to_fsync_list; + int fsync_needed; + inode_t *inode; + int fsync_count; + inode_t *base_inode; +} shard_inode_ctx_t; + +typedef enum { + SHARD_INTERNAL_DIR_DOT_SHARD = 1, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME, +} shard_internal_dir_type_t; + +#endif /* __SHARD_H__ */ diff --git a/xlators/features/qemu-block/Makefile.am b/xlators/features/snapview-client/Makefile.am index af437a64d6d..af437a64d6d 100644 --- a/xlators/features/qemu-block/Makefile.am +++ b/xlators/features/snapview-client/Makefile.am diff --git a/xlators/features/snapview-client/src/Makefile.am b/xlators/features/snapview-client/src/Makefile.am new file mode 100644 index 00000000000..fa08656c537 --- /dev/null +++ b/xlators/features/snapview-client/src/Makefile.am @@ -0,0 +1,16 @@ +xlator_LTLIBRARIES = snapview-client.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +snapview_client_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +snapview_client_la_SOURCES = snapview-client.c +snapview_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = snapview-client.h snapview-client-mem-types.h snapview-client-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/snapview-client/src/snapview-client-mem-types.h 
b/xlators/features/snapview-client/src/snapview-client-mem-types.h new file mode 100644 index 00000000000..3c3ab555a55 --- /dev/null +++ b/xlators/features/snapview-client/src/snapview-client-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _SVC_MEM_TYPES_H +#define _SVC_MEM_TYPES_H + +#include <glusterfs/mem-types.h> + +enum svc_mem_types { + gf_svc_mt_svc_private_t = gf_common_mt_end + 1, + gf_svc_mt_svc_local_t, + gf_svc_mt_svc_inode_t, + gf_svc_mt_svc_fd_t, + gf_svc_mt_end +}; + +#endif diff --git a/xlators/features/snapview-client/src/snapview-client-messages.h b/xlators/features/snapview-client/src/snapview-client-messages.h new file mode 100644 index 00000000000..c02fb154930 --- /dev/null +++ b/xlators/features/snapview-client/src/snapview-client-messages.h @@ -0,0 +1,71 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _SNAPVIEW_CLIENT_MESSAGES_H_ +#define _SNAPVIEW_CLIENT_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(SNAPVIEW_CLIENT, SVC_MSG_NO_MEMORY, SVC_MSG_MEM_ACNT_FAILED, + SVC_MSG_SET_INODE_CONTEXT_FAILED, SVC_MSG_GET_INODE_CONTEXT_FAILED, + SVC_MSG_DELETE_INODE_CONTEXT_FAILED, SVC_MSG_SET_FD_CONTEXT_FAILED, + SVC_MSG_GET_FD_CONTEXT_FAILED, SVC_MSG_DICT_SET_FAILED, + SVC_MSG_SUBVOLUME_NULL, SVC_MSG_NO_CHILD_FOR_XLATOR, + SVC_MSG_XLATOR_CHILDREN_WRONG, SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL, + SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL, SVC_MSG_OPENDIR_SPECIAL_DIR, + SVC_MSG_RENAME_SNAPSHOT_ENTRY, SVC_MSG_LINK_SNAPSHOT_ENTRY, + SVC_MSG_COPY_ENTRY_POINT_FAILED, SVC_MSG_ENTRY_POINT_SPECIAL_DIR, + SVC_MSG_STR_LEN, SVC_MSG_INVALID_ENTRY_POINT, SVC_MSG_NULL_PRIV, + SVC_MSG_PRIV_DESTROY_FAILED, SVC_MSG_ALLOC_FD_FAILED, + SVC_MSG_ALLOC_INODE_FAILED, SVC_MSG_NULL_SPECIAL_DIR, + SVC_MSG_MEM_POOL_GET_FAILED); + +#define SVC_MSG_ALLOC_FD_FAILED_STR "failed to allocate new fd context" +#define SVC_MSG_SET_FD_CONTEXT_FAILED_STR "failed to set fd context" +#define SVC_MSG_STR_LEN_STR \ + "destination buffer size is less than the length of entry point name" +#define SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL_STR "lookup failed on normal graph" +#define SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL_STR "lookup failed on snapview graph" +#define SVC_MSG_SET_INODE_CONTEXT_FAILED_STR "failed to set inode context" +#define SVC_MSG_NO_MEMORY_STR "failed to allocate memory" +#define SVC_MSG_COPY_ENTRY_POINT_FAILED_STR \ + "failed to copy the entry point string" +#define SVC_MSG_GET_FD_CONTEXT_FAILED_STR "fd context not found" +#define SVC_MSG_GET_INODE_CONTEXT_FAILED_STR "failed to get inode context" +#define SVC_MSG_ALLOC_INODE_FAILED_STR "failed to allocate new inode" +#define SVC_MSG_DICT_SET_FAILED_STR "failed to set dict" +#define SVC_MSG_RENAME_SNAPSHOT_ENTRY_STR \ + "rename happening on a entry residing in snapshot" +#define SVC_MSG_DELETE_INODE_CONTEXT_FAILED_STR "failed to delete inode context" +#define SVC_MSG_NULL_PRIV_STR "priv NULL" +#define SVC_MSG_INVALID_ENTRY_POINT_STR "not a valid entry 
point" +#define SVC_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed" +#define SVC_MSG_NO_CHILD_FOR_XLATOR_STR "configured without any child" +#define SVC_MSG_XLATOR_CHILDREN_WRONG_STR \ + "snap-view-client has got wrong subvolumes. It can have only 2" +#define SVC_MSG_ENTRY_POINT_SPECIAL_DIR_STR \ + "entry point directory cannot be part of special directory" +#define SVC_MSG_NULL_SPECIAL_DIR_STR "null special directory" +#define SVC_MSG_MEM_POOL_GET_FAILED_STR \ + "could not get mem pool for frame->local" +#define SVC_MSG_PRIV_DESTROY_FAILED_STR "failed to destroy private" +#define SVC_MSG_LINK_SNAPSHOT_ENTRY_STR \ + "link happening on a entry residing in snapshot" +#endif /* !_SNAPVIEW_CLIENT_MESSAGES_H_ */ diff --git a/xlators/features/snapview-client/src/snapview-client.c b/xlators/features/snapview-client/src/snapview-client.c new file mode 100644 index 00000000000..486c5179d5b --- /dev/null +++ b/xlators/features/snapview-client/src/snapview-client.c @@ -0,0 +1,2791 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "snapview-client.h" +#include <glusterfs/inode.h> +#include <glusterfs/byte-order.h> + +static void +svc_local_free(svc_local_t *local) +{ + if (local) { + loc_wipe(&local->loc); + if (local->fd) + fd_unref(local->fd); + if (local->xdata) + dict_unref(local->xdata); + mem_put(local); + } +} + +static xlator_t * +svc_get_subvolume(xlator_t *this, int inode_type) +{ + xlator_t *subvolume = NULL; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + + if (inode_type == VIRTUAL_INODE) + subvolume = SECOND_CHILD(this); + else + subvolume = FIRST_CHILD(this); + +out: + return subvolume; +} + +static int32_t +__svc_inode_ctx_set(xlator_t *this, inode_t *inode, int inode_type) +{ + uint64_t value = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + value = inode_type; + + ret = __inode_ctx_set(inode, this, &value); + +out: + return ret; +} + +static int +__svc_inode_ctx_get(xlator_t *this, inode_t *inode, int *inode_type) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = __inode_ctx_get(inode, this, &value); + if (ret < 0) + goto out; + + *inode_type = (int)(value); + +out: + return ret; +} + +static int +svc_inode_ctx_get(xlator_t *this, inode_t *inode, int *inode_type) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __svc_inode_ctx_get(this, inode, inode_type); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +static int32_t +svc_inode_ctx_set(xlator_t *this, inode_t *inode, int inode_type) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __svc_inode_ctx_set(this, inode, inode_type); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +static 
svc_fd_t * +svc_fd_new(void) +{ + svc_fd_t *svc_fd = NULL; + + svc_fd = GF_CALLOC(1, sizeof(*svc_fd), gf_svc_mt_svc_fd_t); + + return svc_fd; +} + +static svc_fd_t * +__svc_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + svc_fd_t *svc_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = __fd_ctx_get(fd, this, &value); + if (ret) + return NULL; + + svc_fd = (svc_fd_t *)((long)value); + +out: + return svc_fd; +} + +static svc_fd_t * +svc_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + svc_fd_t *svc_fd = NULL; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + svc_fd = __svc_fd_ctx_get(this, fd); + } + UNLOCK(&fd->lock); + +out: + return svc_fd; +} + +static int +__svc_fd_ctx_set(xlator_t *this, fd_t *fd, svc_fd_t *svc_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, svc_fd, out); + + value = (uint64_t)(long)svc_fd; + + ret = __fd_ctx_set(fd, this, value); + +out: + return ret; +} + +static svc_fd_t * +__svc_fd_ctx_get_or_new(xlator_t *this, fd_t *fd) +{ + svc_fd_t *svc_fd = NULL; + int ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + inode = fd->inode; + svc_fd = __svc_fd_ctx_get(this, fd); + if (svc_fd) { + ret = 0; + goto out; + } + + svc_fd = svc_fd_new(); + if (!svc_fd) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, SVC_MSG_ALLOC_FD_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + ret = __svc_fd_ctx_set(this, fd, svc_fd); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + ret = -1; + } + +out: + if (ret) { + GF_FREE(svc_fd); + svc_fd = NULL; + } + + return svc_fd; +} + +static svc_fd_t 
* +svc_fd_ctx_get_or_new(xlator_t *this, fd_t *fd) +{ + svc_fd_t *svc_fd = NULL; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + svc_fd = __svc_fd_ctx_get_or_new(this, fd); + } + UNLOCK(&fd->lock); + +out: + return svc_fd; +} + +/** + * @this: xlator + * @entry_point: pointer to the buffer provided by consumer + * @dest_size: size, in bytes, of the buffer pointed to by @entry_point + * + * This function is mainly for copying the entry point name + * (stored as string in priv->path) to the buffer pointed to by + * @entry_point within the lock. It is for the consumer to + * allocate the memory for the buffer. + * + * This function is called by all the functions (or fops) + * who need to use priv->path for avoiding the race. + * For example, either in lookup or in any other fop, + * while priv->path is being accessed, a reconfigure can + * happen to change priv->path. This ensures that a lock + * is taken before accessing priv->path. + * + * Returns 0 once the name has been copied. Returns -1 when the + * validation of @this or @entry_point fails, or when @dest_size is + * not strictly greater than strlen(priv->path) (the failure is + * logged and nothing is written to @entry_point). + **/ +int +gf_svc_get_entry_point(xlator_t *this, char *entry_point, size_t dest_size) +{ + int ret = -1; + svc_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, entry_point, out); + + priv = this->private; + + LOCK(&priv->lock); + { + if (dest_size <= strlen(priv->path)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_STR_LEN, + "dest-size=%zu", dest_size, "priv-path-len=%zu", + strlen(priv->path), "path=%s", priv->path, NULL); + } else { + snprintf(entry_point, dest_size, "%s", priv->path); + ret = 0; + } + } + UNLOCK(&priv->lock); + +out: + return ret; +} + +/* + * Lookup callback. On success it records in the inode context whether the + * inode was served by the first child (NORMAL_INODE) or by the second child + * (VIRTUAL_INODE). When a gfid-based lookup on the first child fails with + * ENOENT or ESTALE and no inode context is available, the lookup is wound + * again to the second child instead of being unwound. + */ +static int32_t +gf_svc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + svc_local_t *local = NULL; + xlator_t *subvolume = NULL; + gf_boolean_t do_unwind = _gf_true; + int inode_type = -1; + int ret = -1; + + local = frame->local; + subvolume = local->subvolume; + if (!subvolume) { + 
gf_msg_callingfn(this->name, GF_LOG_ERROR, 0, SVC_MSG_SUBVOLUME_NULL, + "path: %s gfid: %s ", local->loc.path, + inode ? uuid_utoa(inode->gfid) : ""); + GF_ASSERT(0); + } + + /* There is a possibility that, the client process just came online + and does not have the inode on which the lookup came. In that case, + the fresh inode created from fuse for the lookup fop, won't have + the inode context set without which svc cannot decide where to + STACK_WIND to. So by default it decides to send the fop to the + regular subvolume (i.e first child of the xlator). If lookup fails + on the regular volume, then there is a possibility that the lookup + is happening on a virtual inode (i.e history data residing in snaps). + So if lookup fails with ENOENT and the inode context is not there, + then send the lookup to the 2nd child of svc. + + If there are any changes in volfile/client-restarted then inode-ctx + is lost. In this case if nameless lookup fails with ESTALE, + then send the lookup to the 2nd child of svc. + */ + if (op_ret) { + if (subvolume == FIRST_CHILD(this)) { + gf_smsg(this->name, + (op_errno == ENOENT || op_errno == ESTALE) ? GF_LOG_DEBUG + : GF_LOG_ERROR, + op_errno, SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL, "error=%s", + strerror(op_errno), NULL); + } else { + gf_smsg(this->name, + (op_errno == ENOENT || op_errno == ESTALE) ? GF_LOG_DEBUG + : GF_LOG_ERROR, + op_errno, SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL, "error=%s", + strerror(op_errno), NULL); + goto out; + } + + if ((op_errno == ENOENT || op_errno == ESTALE) && + !gf_uuid_is_null(local->loc.gfid)) { + if (inode != NULL) + ret = svc_inode_ctx_get(this, inode, &inode_type); + + if (ret < 0 || inode == NULL) { + gf_msg_debug(this->name, 0, + "Lookup on normal graph failed. 
" + " Sending lookup to snapview-server"); + subvolume = SECOND_CHILD(this); + local->subvolume = subvolume; + STACK_WIND(frame, gf_svc_lookup_cbk, subvolume, + subvolume->fops->lookup, &local->loc, xdata); + do_unwind = _gf_false; + } + } + + goto out; + } + + if (subvolume == FIRST_CHILD(this)) + inode_type = NORMAL_INODE; + else + inode_type = VIRTUAL_INODE; + + ret = svc_inode_ctx_set(this, inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + +out: + if (do_unwind) { + SVC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + } + + return 0; +} + +static int32_t +gf_svc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = -1; + svc_local_t *local = NULL; + xlator_t *subvolume = NULL; + int op_ret = -1; + int op_errno = EINVAL; + inode_t *parent = NULL; + dict_t *new_xdata = NULL; + int inode_type = -1; + int parent_type = -1; + gf_boolean_t wind = _gf_false; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); + if (!__is_root_gfid(loc->gfid)) { + if (loc->parent) { + parent = inode_ref(loc->parent); + ret = svc_inode_ctx_get(this, loc->parent, &parent_type); + } else { + parent = inode_parent(loc->inode, loc->pargfid, NULL); + if (parent) + ret = svc_inode_ctx_get(this, parent, &parent_type); + } + } + + local = mem_get0(this->local_pool); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, NULL); + goto out; + } + + frame->local = local; + loc_copy(&local->loc, loc); + + if (__is_root_gfid(loc->inode->gfid)) { + subvolume = FIRST_CHILD(this); + GF_ASSERT(subvolume); + local->subvolume = subvolume; + 
wind = _gf_true; + goto out; + } + + /* nfs sends nameless lookups directly using the gfid. In that case + loc->name will be NULL. So check if loc->name is NULL. If so, then + try to get the subvolume using inode context. But if the inode has + not been looked up yet, then send the lookup call to the first + subvolume. + */ + + if (!loc->name) { + if (gf_uuid_is_null(loc->inode->gfid)) { + subvolume = FIRST_CHILD(this); + local->subvolume = subvolume; + wind = _gf_true; + goto out; + } else { + if (inode_type >= 0) + subvolume = svc_get_subvolume(this, inode_type); + else + subvolume = FIRST_CHILD(this); + local->subvolume = subvolume; + wind = _gf_true; + goto out; + } + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (strcmp(loc->name, entry_point)) { + if (parent_type == VIRTUAL_INODE) { + subvolume = SECOND_CHILD(this); + } else { + /* + * Either parent type is normal graph, or the parent + * type is uncertain. + */ + subvolume = FIRST_CHILD(this); + } + local->subvolume = subvolume; + } else { + subvolume = SECOND_CHILD(this); + local->subvolume = subvolume; + if (parent_type == NORMAL_INODE) { + /* Indication of whether the lookup is happening on the + entry point or not, to the snapview-server. 
+ */ + SVC_ENTRY_POINT_SET(this, xdata, op_ret, op_errno, new_xdata, ret, + out); + } + } + + wind = _gf_true; + +out: + if (wind) + STACK_WIND(frame, gf_svc_lookup_cbk, subvolume, subvolume->fops->lookup, + loc, xdata); + else + SVC_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL); + if (new_xdata) + dict_unref(new_xdata); + + if (parent) + inode_unref(parent); + + return 0; +} + +/* + * statfs fop. The call is wound to the subvolume that + * SVC_GET_SUBVOL_FROM_CTX picks for loc->inode, except for a statfs on the + * snapshot entry point itself, which is redirected to the first child with + * a loc describing the root. Unwinds with op_ret -1 on any validation or + * entry-point copy failure. + */ +static int32_t +gf_svc_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + xlator_t *subvolume = NULL; + int32_t ret = -1; + int inode_type = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + const char *path = NULL; + int path_len = -1; + int snap_len = -1; + loc_t root_loc = { + 0, + }; + loc_t *temp_loc = NULL; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + temp_loc = loc; + + /* + * priv->path may be changed by a concurrent reconfigure, so it is never + * read directly here: both the length and the comparison below use the + * copy that gf_svc_get_entry_point() takes under priv->lock. Only + * virtual inodes need the entry point; for the others entry_point stays + * empty and the check below is skipped on inode_type. + */ + if (inode_type == VIRTUAL_INODE && + gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + path_len = strlen(loc->path); + snap_len = strlen(entry_point); + + if (path_len >= snap_len && inode_type == VIRTUAL_INODE) { + /* compare only the trailing snap_len bytes of the path */ + path = &loc->path[path_len - snap_len]; + if (!strcmp(path, entry_point)) { + /* + * statfs call for virtual snap directory. 
+ * Sent the fops to parent volume by removing + * virtual directory from path + */ + subvolume = FIRST_CHILD(this); + root_loc.path = gf_strdup("/"); + gf_uuid_clear(root_loc.gfid); + root_loc.gfid[15] = 1; + root_loc.inode = inode_ref(loc->inode->table->root); + temp_loc = &root_loc; + } + } + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->statfs, temp_loc, xdata); + if (temp_loc == &root_loc) + loc_wipe(temp_loc); + + wind = _gf_true; +out: + if (!wind) + SVC_STACK_UNWIND(statfs, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +static int32_t +gf_svc_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + /* TODO: FIX ME + * Consider a testcase: + * #mount -t nfs host1:/vol1 /mnt + * #ls /mnt + * #ls /mnt/.snaps (As expected this fails) + * #gluster volume set vol1 features.uss enable + * Now `ls /mnt/.snaps` should work, but fails with No such file or + * directory. This is because NFS client (gNFS) caches the list of files + * in a directory. This cache is updated if there are any changes in the + * directory attributes. So, one way to solve this problem is to change + * 'ctime' attribute when USS is enabled as below. + * + * if (op_ret == 0 && IA_ISDIR(buf->ia_type)) + * buf->ia_ctime_nsec++; + * + * But this is not the ideal solution as applications see the unexpected + * ctime change causing failures. + */ + + SVC_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +/* should all the fops be handled like lookup is supposed to be + handled? i.e just based on inode type decide where the call should + be sent and in the call back update the contexts. 
+*/ +static int32_t +gf_svc_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + + STACK_WIND(frame, gf_svc_stat_cbk, subvolume, subvolume->fops->stat, loc, + xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +static int32_t +gf_svc_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->fstat, fd, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); + + return ret; +} + +static int32_t +gf_svc_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + svc_fd_t *svc_fd = NULL; + svc_local_t *local = NULL; + svc_private_t *priv = NULL; + gf_boolean_t special_dir = _gf_false; + char path[PATH_MAX] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + if (op_ret) + goto out; + + priv = this->private; + local = frame->local; + + 
if (local->subvolume == FIRST_CHILD(this) && priv->special_dir && + strcmp(priv->special_dir, "")) { + if (!__is_root_gfid(fd->inode->gfid)) + snprintf(path, sizeof(path), "%s/.", priv->special_dir); + else + snprintf(path, sizeof(path), "/."); + + if (!strcmp(local->loc.path, priv->special_dir) || + !strcmp(local->loc.path, path)) { + gf_msg_debug(this->name, 0, + "got opendir on special directory" + " %s (gfid: %s)", + path, uuid_utoa(fd->inode->gfid)); + special_dir = _gf_true; + } + } + + if (special_dir) { + svc_fd = svc_fd_ctx_get_or_new(this, fd); + if (!svc_fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + svc_fd->last_offset = -1; + svc_fd->special_dir = special_dir; + } + +out: + /* + * gf_svc_opendir() attached a svc_local_t holding a loc_copy() to this + * frame, so unwind the same way the other fops of this file do. + * NOTE(review): SVC_STACK_UNWIND is defined outside this file; it is + * presumed to release frame->local through svc_local_free() (which + * wipes the copied loc) - confirm against snapview-client.h. + */ + SVC_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata); + + return 0; +} + +/* If the inode represents a directory which is actually + present in a snapshot, then opendir on that directory + should be sent to the snap-view-server which opens + the directory in the corresponding graph. + In fact any opendir call on a virtual directory + should be sent to svs. Because if it fakes success + here, then later when readdir on that fd comes, there + will not be any corresponding fd opened on svs and + svc has to do things that open-behind is doing. 
+*/ +static int32_t +gf_svc_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + svc_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, + "path=%s", loc->path, "gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + goto out; + } + loc_copy(&local->loc, loc); + frame->local = local; + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + local->subvolume = subvolume; + + STACK_WIND(frame, gf_svc_opendir_cbk, subvolume, subvolume->fops->opendir, + loc, fd, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(opendir, frame, op_ret, op_errno, NULL, NULL); + + return 0; +} + +static int32_t +gf_svc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path, + "gfid= %s", uuid_utoa(loc->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +/* XXX: This function is currently not used. Remove "#if 0" when required */ +#if 0 +static int32_t +gf_svc_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO ("svc", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, out); + + ret = svc_inode_ctx_get (this, fd->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_msg (this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "failed to " + "get the inode context for %s", + uuid_utoa (fd->inode->gfid)); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL (frame, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, + valid, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, + NULL, NULL, NULL); + return 0; +} +#endif /* gf_svc_fsetattr() is not used */ + +static int32_t +gf_svc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + svc_private_t *priv = NULL; + char attrname[PATH_MAX] = ""; + char attrval[64] = ""; + dict_t *dict = NULL; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + 
GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + /* + * Samba sends this special key for case insensitive + * filename check. This request comes with a parent + * path and with a special key GF_XATTR_GET_REAL_FILENAME_KEY. + * e.g. "glusterfs.get_real_filename:.snaps". + * If the name variable matches this key then we have + * to send back .snaps as the real filename. + */ + if (!name) + goto stack_wind; + + sscanf(name, "%[^:]:%[^@]", attrname, attrval); + strcat(attrname, ":"); + + if (!strcmp(attrname, GF_XATTR_GET_REAL_FILENAME_KEY)) { + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (!strcasecmp(attrval, entry_point)) { + dict = dict_new(); + if (NULL == dict) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_dynstr_with_alloc(dict, (char *)name, entry_point); + + if (ret) { + op_errno = ENOMEM; + goto out; + } + + op_errno = 0; + op_ret = strlen(entry_point) + 1; + /* We should return from here */ + goto out; + } + } +stack_wind: + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->getxattr, loc, name, + xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, NULL); + + if (dict) + dict_unref(dict); + + return 0; +} + +/* XXX: This function is currently not used. 
Mark it '#if 0' when required */ +#if 0 +static int32_t +gf_svc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + gf_boolean_t wind = _gf_false; + int op_ret = -1; + int op_errno = EINVAL; + + GF_VALIDATE_OR_GOTO ("svc", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, out); + + SVC_GET_SUBVOL_FROM_CTX (this, op_ret, op_errno, inode_type, ret, + fd->inode, subvolume, out); + + STACK_WIND_TAIL (frame, subvolume, + subvolume->fops->fgetxattr, fd, name, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, + NULL, NULL); + return 0; +} +#endif /* gf_svc_fgetxattr() is not used */ + +static int32_t +gf_svc_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "name=%s", loc->name, + "gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); + + return 0; +} + +static int32_t +gf_svc_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t 
*dict, + int32_t flags, dict_t *xdata) +{ + int32_t ret = -1; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + ret = svc_inode_ctx_get(this, fd->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + + return 0; +} + +static int32_t +gf_svc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "name=%s", loc->name, + "gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(rmdir, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +static int32_t 
+gf_svc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + + if (op_ret < 0) + goto out; + + inode_type = NORMAL_INODE; + ret = svc_inode_ctx_set(this, inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + NULL); + +out: + SVC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +static int32_t +gf_svc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + int parent_type = -1; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->parent, &parent_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(loc->parent->gfid), NULL); + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (strcmp(loc->name, entry_point) && parent_type == NORMAL_INODE) { + STACK_WIND(frame, gf_svc_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int32_t +gf_svc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, 
int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + + if (op_ret < 0) + goto out; + + inode_type = NORMAL_INODE; + ret = svc_inode_ctx_set(this, inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + NULL); + +out: + SVC_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +static int32_t +gf_svc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + int parent_type = -1; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->parent, &parent_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(loc->parent->gfid), NULL); + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (strcmp(loc->name, entry_point) && parent_type == NORMAL_INODE) { + STACK_WIND(frame, gf_svc_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +/* If the flags of the open call contain O_WRONLY or O_RDWR and the inode is + a virtual inode, then unwind the call back with EROFS. 
Otherwise simply + STACK_WIND the call to the first child of svc xlator. +*/ +static int32_t +gf_svc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + xlator_t *subvolume = NULL; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + int ret = -1; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + /* Another way is to STACK_WIND to normal subvolume, if inode + type is not there in the context. If the file actually resides + in snapshots, then ENOENT would be returned. Needs more analysis. + */ + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + + if (((flags & O_ACCMODE) == O_WRONLY) || ((flags & O_ACCMODE) == O_RDWR)) { + if (subvolume != FIRST_CHILD(this)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + } + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->open, loc, flags, fd, + xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(open, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +static int32_t +gf_svc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + + if (op_ret < 0) + goto out; + + inode_type = NORMAL_INODE; + ret = svc_inode_ctx_set(this, inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + NULL); + +out: + SVC_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + + return 0; +} + +static int32_t +gf_svc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t 
umask, fd_t *fd, dict_t *xdata) +{ + int parent_type = -1; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = svc_inode_ctx_get(this, loc->parent, &parent_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(loc->parent->gfid), NULL); + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (strcmp(loc->name, entry_point) && parent_type == NORMAL_INODE) { + STACK_WIND(frame, gf_svc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; +} + +static int32_t +gf_svc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + + if (op_ret < 0) + goto out; + + inode_type = NORMAL_INODE; + ret = svc_inode_ctx_set(this, inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + NULL); + +out: + SVC_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + + return 0; +} + +static int32_t +gf_svc_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, 
mode_t umask, dict_t *xdata) +{ + int parent_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + int ret = -1; + gf_boolean_t wind = _gf_false; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->parent, &parent_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(loc->parent->gfid), NULL); + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + if (strcmp(loc->name, entry_point) && parent_type == NORMAL_INODE) { + STACK_WIND(frame, gf_svc_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, + xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(symlink, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; +} + +static int32_t +gf_svc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + int ret = -1; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(loc->parent->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->unlink, loc, flags, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +static int32_t +gf_svc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int inode_type = -1; + xlator_t *subvolume = NULL; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->readv, fd, size, offset, + flags, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL, + NULL); + return 0; +} + +static int32_t +gf_svc_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + int inode_type = -1; + xlator_t *subvolume = NULL; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->readlink, loc, size, + xdata); + + wind = _gf_true; + +out: + if (!wind) + STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, NULL, NULL, + NULL); + return 0; +} + +static int32_t +gf_svc_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + int ret = -1; + int inode_type = -1; + xlator_t *subvolume = 
NULL; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->access, loc, mask, + xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(access, frame, op_ret, op_errno, NULL); + + return 0; +} + +int32_t +gf_svc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmpentry = NULL; + svc_local_t *local = NULL; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + if (op_ret < 0) + goto out; + + local = frame->local; + + /* If .snaps pre-exists, then it should not be listed + * in the NORMAL INODE directory when USS is enabled, + * so filter the .snaps entry if exists. + * However it is OK to list .snaps in VIRTUAL world + */ + if (local->subvolume != FIRST_CHILD(this)) + goto out; + + /* + * Better to goto out if getting the entry point + * fails. We might end up sending the directory + * entry for the snapview entry point in the readdir + * response. But, the intention is to avoid the race + * condition where priv->path is being changed in + * reconfigure while this is accessing it. 
+ */ + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL); + goto out; + } + + list_for_each_entry_safe(entry, tmpentry, &entries->list, list) + { + if (strcmp(entry_point, entry->d_name) == 0) + gf_dirent_entry_free(entry); + } + +out: + SVC_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +static int32_t +gf_svc_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + int inode_type = -1; + xlator_t *subvolume = NULL; + svc_local_t *local = NULL; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + svc_fd_t *svc_fd = NULL; + gf_dirent_t entries; + + INIT_LIST_HEAD(&entries); + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + svc_fd = svc_fd_ctx_get_or_new(this, fd); + if (!svc_fd) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + else { + if (svc_fd->entry_point_handled && off == svc_fd->last_offset) { + op_ret = 0; + op_errno = ENOENT; + goto out; + } + } + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode, + subvolume, out); + + local = mem_get0(this->local_pool); + if (!local) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, + "inode-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + local->subvolume = subvolume; + frame->local = local; + + STACK_WIND(frame, gf_svc_readdir_cbk, subvolume, subvolume->fops->readdir, + fd, size, off, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + + return 0; +} + +/* + * This lookup if mainly for supporting USS for windows. 
+ * Since the dentry for the entry-point directory is not sent in + * the readdir response, from windows explorer, there is no way + * to access the snapshots. If the explicit path of the entry-point + * directory is mentioned in the address bar, then windows sends + * readdir on the parent directory and compares if the entry point + * directory's name is there in readdir response. If it is not there + * then access to snapshot world is denied. And windows users cannot + * access snapshots via samba. + * So, to handle this a new option called special-directory is created, + * which if set, snapview-client will send the entry-point's dentry + * in readdirp o/p for the special directory, so that it will be + * visible from windows explorer. + * But to send that virtual entry, the following mechanism is used. + * 1) Check if readdir from posix is over. + * 2) If so, then send a lookup on entry point directory to snap daemon + * (this is needed because in readdirp inodes are linked, so we need to + * maintain 1:1 mapping between inodes (gfids) from snapview server to + * snapview client). + * 3) Once successful lookup response received, send a new entry to + * windows. 
+ */ + +static int32_t +gf_svc_readdirp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + svc_fd_t *svc_fd = NULL; + svc_local_t *local = NULL; + int inode_type = -1; + int ret = -1; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + + INIT_LIST_HEAD(&entries.list); + + local = frame->local; + + if (op_ret) { + if (op_errno == ESTALE && !local->revalidate) { + local->revalidate = 1; + ret = gf_svc_special_dir_revalidate_lookup(frame, this, xdata); + + if (!ret) + return 0; + } + op_ret = 0; + op_errno = ENOENT; + goto out; + } + + svc_fd = svc_fd_ctx_get(this, local->fd); + if (!svc_fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(local->fd->inode->gfid), NULL); + op_ret = 0; + op_errno = ENOENT; + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED, + NULL); + op_ret = 0; + op_errno = ENOENT; + goto out; + } + + entry = gf_dirent_for_name(entry_point); + if (!entry) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY, + "entry-point=%s", entry_point, NULL); + op_ret = 0; + op_errno = ENOMEM; + goto out; + } + + entry->inode = inode_ref(inode); + entry->d_off = svc_fd->last_offset + 22; + entry->d_ino = buf->ia_ino; + entry->d_type = DT_DIR; + entry->d_stat = *buf; + inode_type = VIRTUAL_INODE; + ret = svc_inode_ctx_set(this, entry->inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED, + "entry-name=%s", entry->d_name, NULL); + + list_add_tail(&entry->list, &entries.list); + op_ret = 1; + svc_fd->last_offset = entry->d_off; + svc_fd->entry_point_handled = _gf_true; + +out: + SVC_STACK_UNWIND(readdirp, frame, op_ret, 
op_errno, &entries, + local ? local->xdata : NULL); + + gf_dirent_free(&entries); + + return 0; +} + +int +gf_svc_special_dir_revalidate_lookup(call_frame_t *frame, xlator_t *this, + dict_t *xdata) +{ + svc_local_t *local = NULL; + loc_t *loc = NULL; + dict_t *tmp_xdata = NULL; + char *path = NULL; + int ret = -1; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + + local = frame->local; + loc = &local->loc; + + if (local->xdata) { + dict_unref(local->xdata); + local->xdata = NULL; + } + + if (xdata) + local->xdata = dict_ref(xdata); + + inode_unref(loc->inode); + loc->inode = inode_new(loc->parent->table); + if (!loc->inode) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, SVC_MSG_ALLOC_INODE_FAILED, + NULL); + goto out; + } + + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED, + NULL); + goto out; + } + + gf_uuid_copy(local->loc.gfid, loc->inode->gfid); + ret = inode_path(loc->parent, entry_point, &path); + if (ret < 0) + goto out; + + if (loc->path) + GF_FREE((char *)loc->path); + + loc->path = gf_strdup(path); + if (loc->path) { + if (!loc->name || (loc->name && !strcmp(loc->name, ""))) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + } + } else + loc->path = NULL; + + tmp_xdata = dict_new(); + if (!tmp_xdata) { + ret = -1; + goto out; + } + + ret = dict_set_str(tmp_xdata, "entry-point", "true"); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_DICT_SET_FAILED, NULL); + goto out; + } + + STACK_WIND(frame, gf_svc_readdirp_lookup_cbk, SECOND_CHILD(this), + SECOND_CHILD(this)->fops->lookup, loc, tmp_xdata); +out: + if (tmp_xdata) + dict_unref(tmp_xdata); + + GF_FREE(path); + return ret; +} + +static gf_boolean_t +gf_svc_readdir_on_special_dir(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) +{ + svc_local_t 
*local = NULL; + svc_private_t *private = NULL; + inode_t *inode = NULL; + fd_t *fd = NULL; + char *path = NULL; + loc_t *loc = NULL; + dict_t *tmp_xdata = NULL; + int ret = -1; + gf_boolean_t unwind = _gf_true; + svc_fd_t *svc_fd = NULL; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + private + = this->private; + local = frame->local; + + loc = &local->loc; + fd = local->fd; + svc_fd = svc_fd_ctx_get(this, fd); + if (!svc_fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + /* + * check if its end of readdir operation from posix, if special_dir + * option is set, if readdir is done on special directory and if + * readdirp is from normal regular graph. + */ + + if (!private->show_entry_point) + goto out; + + if (op_ret == 0 && op_errno == ENOENT && private->special_dir && + strcmp(private->special_dir, "") && svc_fd->special_dir && + local->subvolume == FIRST_CHILD(this)) { + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + SVC_MSG_GET_FD_CONTEXT_FAILED, NULL); + goto out; + } + + inode = inode_grep(fd->inode->table, fd->inode, entry_point); + if (!inode) { + inode = inode_new(fd->inode->table); + if (!inode) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_ALLOC_INODE_FAILED, + NULL); + goto out; + } + } + + gf_uuid_copy(local->loc.pargfid, fd->inode->gfid); + gf_uuid_copy(local->loc.gfid, inode->gfid); + if (gf_uuid_is_null(inode->gfid)) + ret = inode_path(fd->inode, entry_point, &path); + else + ret = inode_path(inode, NULL, &path); + + if (ret < 0) + goto out; + loc->path = gf_strdup(path); + if (loc->path) { + if (!loc->name || (loc->name && !strcmp(loc->name, ""))) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + } + } + + loc->inode = inode; + loc->parent = 
inode_ref(fd->inode); + tmp_xdata = dict_new(); + if (!tmp_xdata) + goto out; + ret = dict_set_str(tmp_xdata, "entry-point", "true"); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_DICT_SET_FAILED, NULL); + goto out; + } + + local->cookie = cookie; + if (local->xdata) { + dict_unref(local->xdata); + local->xdata = NULL; + } + if (xdata) + local->xdata = dict_ref(xdata); + + STACK_WIND(frame, gf_svc_readdirp_lookup_cbk, SECOND_CHILD(this), + SECOND_CHILD(this)->fops->lookup, loc, tmp_xdata); + unwind = _gf_false; + } + +out: + if (tmp_xdata) + dict_unref(tmp_xdata); + + GF_FREE(path); + return unwind; +} + +static int32_t +gf_svc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmpentry = NULL; + svc_local_t *local = NULL; + int inode_type = -1; + int ret = -1; + svc_fd_t *svc_fd = NULL; + gf_boolean_t unwind = _gf_true; + char entry_point[NAME_MAX + 1] = { + 0, + }; + + if (op_ret < 0) + goto out; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + + local = frame->local; + + svc_fd = svc_fd_ctx_get(this, local->fd); + if (!svc_fd) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(local->fd->inode->gfid), NULL); + } + + if (local->subvolume == FIRST_CHILD(this)) + inode_type = NORMAL_INODE; + else + inode_type = VIRTUAL_INODE; + + /* + * Better to goto out and return whatever is there in the + * readdirp response (even if the readdir response contains + * a directory entry for the snapshot entry point). 
Otherwise + * if we ignore the error, then there is a chance of race + * condition where, priv->path is changed in reconfigure + */ + if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED, + NULL); + goto out; + } + + list_for_each_entry_safe(entry, tmpentry, &entries->list, list) + { + /* If .snaps pre-exists, then it should not be listed + * in the NORMAL INODE directory when USS is enabled, + * so filter the .snaps entry if exists. + * However it is OK to list .snaps in VIRTUAL world + */ + if (inode_type == NORMAL_INODE && !strcmp(entry_point, entry->d_name)) { + gf_dirent_entry_free(entry); + continue; + } + + if (!entry->inode) + continue; + + ret = svc_inode_ctx_set(this, entry->inode, inode_type); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL); + if (svc_fd) + svc_fd->last_offset = entry->d_off; + } + + unwind = gf_svc_readdir_on_special_dir(frame, cookie, this, op_ret, + op_errno, entries, xdata); + +out: + if (unwind) + SVC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + + return 0; +} + +static int32_t +gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + int inode_type = -1; + xlator_t *subvolume = NULL; + svc_local_t *local = NULL; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + svc_fd_t *svc_fd = NULL; + gf_dirent_t entries; + + INIT_LIST_HEAD(&entries.list); + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, NULL); + goto out; + } + + /* + * This is mainly for samba shares (or windows clients). 
As part of + * readdirp on the directory used as samba share, the entry point + * directory would have been added at the end. So when a new readdirp + * request comes, we have to check if the entry point has been handled + * or not in readdirp. That information and the offset used for it + * is remembered in fd context. If it has been handled, then simply + * unwind indication end of readdir operation. + */ + svc_fd = svc_fd_ctx_get_or_new(this, fd); + if (!svc_fd) + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + else { + if (svc_fd->entry_point_handled && off == svc_fd->last_offset) { + op_ret = 0; + op_errno = ENOENT; + goto out; + } + } + + SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode, + subvolume, out); + + local->subvolume = subvolume; + local->fd = fd_ref(fd); + frame->local = local; + + STACK_WIND(frame, gf_svc_readdirp_cbk, subvolume, subvolume->fops->readdirp, + fd, size, off, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + + return 0; +} + +/* Renaming the entries from or to snapshots is not allowed as the snapshots + are read-only. 
+*/ +static int32_t +gf_svc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int src_inode_type = -1; + int dst_inode_type = -1; + int dst_parent_type = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t ret = -1; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, oldloc, out); + GF_VALIDATE_OR_GOTO(this->name, oldloc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, newloc, out); + + ret = svc_inode_ctx_get(this, oldloc->inode, &src_inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(oldloc->inode->gfid), NULL); + goto out; + } + + if (src_inode_type == VIRTUAL_INODE) { + op_ret = -1; + op_errno = EROFS; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_RENAME_SNAPSHOT_ENTRY, "name=%s", oldloc->name, NULL); + goto out; + } + + if (newloc->inode) { + ret = svc_inode_ctx_get(this, newloc->inode, &dst_inode_type); + if (!ret && dst_inode_type == VIRTUAL_INODE) { + op_ret = -1; + op_errno = EROFS; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_RENAME_SNAPSHOT_ENTRY, "oldloc-name=%s", + oldloc->name, "newloc-name=%s", newloc->name, NULL); + goto out; + } + } + + if (dst_inode_type < 0) { + ret = svc_inode_ctx_get(this, newloc->parent, &dst_parent_type); + if (!ret && dst_parent_type == VIRTUAL_INODE) { + op_ret = -1; + op_errno = EROFS; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_RENAME_SNAPSHOT_ENTRY, "oldloc-name=%s", + oldloc->name, "newloc-name=%s", newloc->name, NULL); + goto out; + } + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; +} + +/* Creating hardlinks 
for the files from the snapshot is not allowed as it + will be equivalent of creating hardlinks across different filesystems. + And so is vice versa. +*/ +static int32_t +gf_svc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int src_inode_type = -1; + int dst_parent_type = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t ret = -1; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, oldloc, out); + GF_VALIDATE_OR_GOTO(this->name, oldloc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, newloc, out); + + ret = svc_inode_ctx_get(this, oldloc->inode, &src_inode_type); + if (!ret && src_inode_type == VIRTUAL_INODE) { + op_ret = -1; + op_errno = EROFS; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_LINK_SNAPSHOT_ENTRY, + "oldloc-name=%s", oldloc->name, NULL); + goto out; + } + + ret = svc_inode_ctx_get(this, newloc->parent, &dst_parent_type); + if (!ret && dst_parent_type == VIRTUAL_INODE) { + op_ret = -1; + op_errno = EROFS; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_LINK_SNAPSHOT_ENTRY, + "oldloc-name=%s", oldloc->name, "newloc-name=%s", newloc->name, + NULL); + goto out; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int32_t +gf_svc_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int ret = -1; + int inode_type = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + ret = svc_inode_ctx_get(this, loc->inode, &inode_type); 
+ if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path, + "gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); + + return 0; +} + +static int +gf_svc_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + int inode_type = -1; + int ret = -1; + int op_ret = -1; + int op_errno = EINVAL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + ret = svc_inode_ctx_get(this, fd->inode, &inode_type); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + if (inode_type == NORMAL_INODE) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + } else { + op_ret = -1; + op_errno = EROFS; + goto out; + } + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +static int32_t +gf_svc_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + int inode_type = -1; + xlator_t *subvolume = NULL; + gf_boolean_t wind = _gf_false; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + 
SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode, + subvolume, out); + + STACK_WIND_TAIL(frame, subvolume, subvolume->fops->flush, fd, xdata); + + wind = _gf_true; + +out: + if (!wind) + SVC_STACK_UNWIND(flush, frame, op_ret, op_errno, NULL); + + return 0; +} + +static int32_t +gf_svc_releasedir(xlator_t *this, fd_t *fd) +{ + svc_fd_t *sfd = NULL; + uint64_t tmp_pfd = 0; + int ret = 0; + + GF_VALIDATE_OR_GOTO("snapview-client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd); + goto out; + } + + GF_FREE(sfd); + +out: + return 0; +} + +static int32_t +gf_svc_forget(xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t value = 0; + + GF_VALIDATE_OR_GOTO("svc", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = inode_ctx_del(inode, this, &value); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + SVC_MSG_DELETE_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(inode->gfid), NULL); + goto out; + } + +out: + return 0; +} + +static int +gf_svc_priv_destroy(xlator_t *this, svc_private_t *priv) +{ + int ret = -1; + + if (!priv) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_NULL_PRIV, NULL); + goto out; + } + + GF_FREE(priv->path); + GF_FREE(priv->special_dir); + + LOCK_DESTROY(&priv->lock); + + GF_FREE(priv); + + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + ret = 0; + +out: + return ret; +} + +/** + * ** NOTE **: + * ============= + * The option "snapdir-entry-path" is NOT reconfigurable. + * That option as of now is only for the consumption of + * samba, where, it needs to tell glusterfs about the + * directory that is shared with windows client for the + * access. 
Now, in windows-explorer (GUI) interface, for + * the directory shared, the entry point to the snapshot + * world (snapshot-directory option) should be visible, + * atleast as a hidden entry. For that to happen, glusterfs + * has to send that entry in the readdir response coming on + * the directory used as the smb share. Therefore, samba, + * while initializing the gluster volume (via gfapi) sets + * the xlator option "snapdir-entry-path" to the directory + * which is to be shared with windows (check the file + * vfs_glusterfs.c from samba source code). So to avoid + * problems with smb access, not allowing snapdir-entry-path + * option to be configurable. That option is for those + * consumers who know what they are doing. + **/ +int +reconfigure(xlator_t *this, dict_t *options) +{ + svc_private_t *priv = NULL; + char *path = NULL; + gf_boolean_t show_entry_point = _gf_false; + char *tmp = NULL; + + priv = this->private; + + GF_OPTION_RECONF("snapshot-directory", path, options, str, out); + if (!path || (strlen(path) > NAME_MAX) || path[0] != '.') { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_INVALID_ENTRY_POINT, + "path=%s", path, NULL); + goto out; + } + + GF_OPTION_RECONF("show-snapshot-directory", show_entry_point, options, bool, + out); + + /* + * The assumption now is that priv->path is an allocated memory (either + * in init or in a previous reconfigure). + * So, the intention here is to preserve the older contents of the option + * until the new option's value has been completely stored in the priv. + * So, do this. + * - Store the pointer of priv->path in a temporary pointer. + * - Allocate new memory for the new value of the option that is just + * obtained from the above call to GF_OPTION_RECONF. + * - If the above allocation fails, again set the pointer from priv + * to the address stored in tmp. i.e. the previous value. + * - If the allocation succeeds, then free the tmp pointer. 
+ * WARNING: Before changing the allocation and freeing logic of + * priv->path, always check the init function to see how + * priv->path is set. Take decisions accordingly. As of now, + * the assumption is that, the string elements of private + * structure of snapview-client are allocated (either in + * init or here in reconfugure). + */ + LOCK(&priv->lock); + { + tmp = priv->path; + priv->path = NULL; + priv->path = gf_strdup(path); + if (!priv->path) { + gf_log(this->name, GF_LOG_ERROR, + "failed to reconfigure snapshot-directory option to %s", + path); + priv->path = tmp; + } else { + GF_FREE(tmp); + tmp = NULL; + } + + priv->show_entry_point = show_entry_point; + } + UNLOCK(&priv->lock); + +out: + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_svc_mt_end + 1); + + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_MEM_ACNT_FAILED, NULL); + } + + return ret; +} + +int32_t +init(xlator_t *this) +{ + svc_private_t *private = NULL; + int ret = -1; + int children = 0; + xlator_list_t *xl = NULL; + char *path = NULL; + char *special_dir = NULL; + + if (!this->children) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_CHILD_FOR_XLATOR, NULL); + goto out; + } + + xl = this->children; + while (xl) { + children++; + xl = xl->next; + } + + if (children != 2) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_XLATOR_CHILDREN_WRONG, + "subvol-num=%d", children, NULL); + goto out; + } + + /* This can be the top of graph in certain cases */ + if (!this->parents) { + gf_msg_debug(this->name, 0, + "dangling volume. 
Check " + "volfile"); + } + + private + = GF_CALLOC(1, sizeof(*private), gf_svc_mt_svc_private_t); + if (!private) + goto out; + + LOCK_INIT(&private->lock); + + GF_OPTION_INIT("snapshot-directory", path, str, out); + if (!path || (strlen(path) > NAME_MAX) || path[0] != '.') { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_INVALID_ENTRY_POINT, + "path=%s", path, NULL); + goto out; + } + + private + ->path = gf_strdup(path); + if (!private->path) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY, + "entry-point-path=%s", path, NULL); + goto out; + } + + GF_OPTION_INIT("snapdir-entry-path", special_dir, str, out); + if (!special_dir || strstr(special_dir, path)) { + if (special_dir) + gf_smsg(this->name, GF_LOG_ERROR, 0, + SVC_MSG_ENTRY_POINT_SPECIAL_DIR, "path=%s", path, + "special-dir=%s", special_dir); + else + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NULL_SPECIAL_DIR, + NULL); + goto out; + } + + private + ->special_dir = gf_strdup(special_dir); + if (!private->special_dir) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY, + "special-directory=%s", special_dir, NULL); + goto out; + } + + GF_OPTION_INIT("show-snapshot-directory", private->show_entry_point, bool, + out); + + this->local_pool = mem_pool_new(svc_local_t, 128); + if (!this->local_pool) { + gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_MEM_POOL_GET_FAILED, NULL); + goto out; + } + + this->private = private; + + ret = 0; + +out: + if (ret) + (void)gf_svc_priv_destroy(this, private); + + return ret; +} + +void +fini(xlator_t *this) +{ + svc_private_t *priv = NULL; + + if (!this) + return; + + priv = this->private; + if (!priv) + return; + + /* + * Just log the failure and go ahead to + * set this->priv to NULL. + */ + if (gf_svc_priv_destroy(this, priv)) + gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_PRIV_DESTROY_FAILED, + NULL); + + this->private = NULL; + + return; +} + +int +notify(xlator_t *this, int event, void *data, ...) 
+{ + xlator_t *subvol = NULL; + int ret = 0; + + subvol = data; + + /* As there are two subvolumes in snapview-client, there is + * a possibility that the regular subvolume is still down and + * snapd subvolume come up first. So if we don't handle this situation + * CHILD_UP event will be propagated upwards to fuse when + * regular subvolume is still down. + * This can cause data unavailable for the application. + * So for now send notifications up only for regular subvolume. + * + * TODO: In future if required we may need to handle + * notifications from virtual subvolume + */ + if (subvol != SECOND_CHILD(this)) + ret = default_notify(this, event, data); + + return ret; +} + +struct xlator_fops fops = { + .lookup = gf_svc_lookup, + .opendir = gf_svc_opendir, + .stat = gf_svc_stat, + .fstat = gf_svc_fstat, + .statfs = gf_svc_statfs, + .rmdir = gf_svc_rmdir, + .rename = gf_svc_rename, + .mkdir = gf_svc_mkdir, + .open = gf_svc_open, + .unlink = gf_svc_unlink, + .setattr = gf_svc_setattr, + .getxattr = gf_svc_getxattr, + .setxattr = gf_svc_setxattr, + .fsetxattr = gf_svc_fsetxattr, + .readv = gf_svc_readv, + .readdir = gf_svc_readdir, + .readdirp = gf_svc_readdirp, + .create = gf_svc_create, + .readlink = gf_svc_readlink, + .mknod = gf_svc_mknod, + .symlink = gf_svc_symlink, + .flush = gf_svc_flush, + .link = gf_svc_link, + .access = gf_svc_access, + .removexattr = gf_svc_removexattr, + .fsync = gf_svc_fsync, +}; + +struct xlator_cbks cbks = { + .forget = gf_svc_forget, + .releasedir = gf_svc_releasedir, +}; + +struct volume_options options[] = { + { + .key = {"snapshot-directory"}, + .type = GF_OPTION_TYPE_STR, + .default_value = ".snaps", + }, + { + .key = {"snapdir-entry-path"}, + .type = GF_OPTION_TYPE_STR, + .description = "An option to set the path of a directory on which " + "when readdir comes, dentry for the snapshot-directory" + " should be created and added in the readdir response", + .default_value = "", + }, + { + .key = {"show-snapshot-directory"}, + 
.type = GF_OPTION_TYPE_BOOL, + .description = "If this option is set, and the option " + "\"snapdir-entry-path\" is set (which is set by samba " + "vfs plugin for glusterfs, then send the entry point " + "when readdir comes on the snapdir-entry-path", + .default_value = "off", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "snapview-client", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/snapview-client/src/snapview-client.h b/xlators/features/snapview-client/src/snapview-client.h new file mode 100644 index 00000000000..166116a439d --- /dev/null +++ b/xlators/features/snapview-client/src/snapview-client.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __SNAP_VIEW_CLIENT_H__ +#define __SNAP_VIEW_CLIENT_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include "snapview-client-mem-types.h" +#include "snapview-client-messages.h" + +struct __svc_local { + loc_t loc; + xlator_t *subvolume; + fd_t *fd; + void *cookie; + dict_t *xdata; + uint16_t revalidate; +}; +typedef struct __svc_local svc_local_t; + +#define SVC_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + svc_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + svc_local_free(__local); \ + } while (0) + +#define SVC_ENTRY_POINT_SET(this, xdata, op_ret, op_errno, new_xdata, ret, \ + label) \ + do { \ + if (!xdata) { \ + xdata = new_xdata = dict_new(); \ + if (!new_xdata) { \ + gf_log(this->name, GF_LOG_ERROR, \ + "failed to allocate new dict"); \ + op_ret = -1; \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } \ + ret = dict_set_str(xdata, "entry-point", "true"); \ + if (ret) { \ + gf_log(this->name, GF_LOG_ERROR, "failed to set dict"); \ + op_ret = -1; \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } while (0); + +#define SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, \ + inode, subvolume, label) \ + do { \ + ret = svc_inode_ctx_get(this, inode, &inode_type); \ + if (ret < 0) { \ + gf_log(this->name, GF_LOG_ERROR, \ + "inode context not found for gfid %s", \ + uuid_utoa(inode->gfid)); \ + op_ret = -1; \ + op_errno = EINVAL; \ + goto label; \ + } \ + \ + subvolume = svc_get_subvolume(this, inode_type); \ + } while (0); + +struct svc_private { + char *path; + char *special_dir; /* needed for samba */ + gf_boolean_t show_entry_point; + gf_lock_t lock; /* mainly to guard private->path */ +}; +typedef struct svc_private svc_private_t; + +struct svc_fd { + off_t last_offset; + gf_boolean_t entry_point_handled; + gf_boolean_t special_dir; +}; +typedef struct svc_fd svc_fd_t; + +typedef enum { NORMAL_INODE = 1, VIRTUAL_INODE } inode_type_t; + +int +gf_svc_special_dir_revalidate_lookup(call_frame_t *frame, xlator_t *this, + dict_t *xdata); + +#endif /* __SNAP_VIEW_CLIENT_H__ */ diff --git a/xlators/features/snapview-server/Makefile.am b/xlators/features/snapview-server/Makefile.am new file mode 100644 index 00000000000..af437a64d6d --- /dev/null +++ b/xlators/features/snapview-server/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git 
a/xlators/features/snapview-server/src/Makefile.am b/xlators/features/snapview-server/src/Makefile.am new file mode 100644 index 00000000000..2935f138a4c --- /dev/null +++ b/xlators/features/snapview-server/src/Makefile.am @@ -0,0 +1,25 @@ +if WITH_SERVER +xlator_LTLIBRARIES = snapview-server.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +snapview_server_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +snapview_server_la_SOURCES = snapview-server.c snapview-server-mgmt.c \ + snapview-server-helpers.c + +snapview_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/api/src/libgfapi.la \ + $(RLLIBS) $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la + +noinst_HEADERS = snapview-server.h snapview-server-mem-types.h snapview-server-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/api/src -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -DDATADIR=\"$(localstatedir)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/snapview-server/src/snapview-server-helpers.c b/xlators/features/snapview-server/src/snapview-server-helpers.c new file mode 100644 index 00000000000..62c1ddac49c --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server-helpers.c @@ -0,0 +1,715 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#include "snapview-server.h" +#include "snapview-server-mem-types.h" + +#include <glusterfs/xlator.h> +#include "rpc-clnt.h" +#include "xdr-generic.h" +#include "protocol-common.h" +#include <pthread.h> + +int +__svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, svs_inode, out); + + value = (uint64_t)(long)svs_inode; + + ret = __inode_ctx_set(inode, this, &value); + +out: + return ret; +} + +svs_inode_t * +__svs_inode_ctx_get(xlator_t *this, inode_t *inode) +{ + svs_inode_t *svs_inode = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = __inode_ctx_get(inode, this, &value); + if (ret) + goto out; + + svs_inode = (svs_inode_t *)((long)value); + +out: + return svs_inode; +} + +svs_inode_t * +svs_inode_ctx_get(xlator_t *this, inode_t *inode) +{ + svs_inode_t *svs_inode = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + svs_inode = __svs_inode_ctx_get(this, inode); + } + UNLOCK(&inode->lock); + +out: + return svs_inode; +} + +int32_t +svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, svs_inode, out); + + LOCK(&inode->lock); + { + ret = __svs_inode_ctx_set(this, inode, svs_inode); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +svs_inode_t * +svs_inode_new(void) +{ + svs_inode_t *svs_inode = NULL; + + svs_inode = GF_CALLOC(1, sizeof(*svs_inode), gf_svs_mt_svs_inode_t); + + return svs_inode; +} + +svs_inode_t * +svs_inode_ctx_get_or_new(xlator_t *this, inode_t *inode) +{ + svs_inode_t 
*svs_inode = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + svs_inode = __svs_inode_ctx_get(this, inode); + if (!svs_inode) { + svs_inode = svs_inode_new(); + if (svs_inode) { + ret = __svs_inode_ctx_set(this, inode, svs_inode); + if (ret) { + GF_FREE(svs_inode); + svs_inode = NULL; + } + } + } + } + UNLOCK(&inode->lock); + +out: + return svs_inode; +} + +svs_fd_t * +svs_fd_new(void) +{ + svs_fd_t *svs_fd = NULL; + + svs_fd = GF_CALLOC(1, sizeof(*svs_fd), gf_svs_mt_svs_fd_t); + + return svs_fd; +} + +int +__svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, svs_fd, out); + + value = (uint64_t)(long)svs_fd; + + ret = __fd_ctx_set(fd, this, value); + +out: + return ret; +} + +svs_fd_t * +__svs_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + svs_fd_t *svs_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = __fd_ctx_get(fd, this, &value); + if (ret) + return NULL; + + svs_fd = (svs_fd_t *)((long)value); + +out: + return svs_fd; +} + +svs_fd_t * +svs_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + svs_fd_t *svs_fd = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + svs_fd = __svs_fd_ctx_get(this, fd); + } + UNLOCK(&fd->lock); + +out: + return svs_fd; +} + +int32_t +svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, svs_fd, out); + + LOCK(&fd->lock); + { + ret = __svs_fd_ctx_set(this, fd, svs_fd); + } + UNLOCK(&fd->lock); + +out: + return ret; +} + 
+svs_fd_t * +__svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd) +{ + svs_fd_t *svs_fd = NULL; + int ret = -1; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + svs_inode_t *inode_ctx = NULL; + glfs_fd_t *glfd = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + inode = fd->inode; + svs_fd = __svs_fd_ctx_get(this, fd); + if (svs_fd) { + ret = 0; + goto out; + } + + svs_fd = svs_fd_new(); + if (!svs_fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NEW_FD_CTX_FAILED, + "failed to allocate new fd " + "context for gfid %s", + uuid_utoa(inode->gfid)); + goto out; + } + + if (fd_is_anonymous(fd)) { + inode_ctx = svs_inode_ctx_get(this, inode); + if (!inode_ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "failed to get inode " + "context for %s", + uuid_utoa(inode->gfid)); + goto out; + } + + fs = inode_ctx->fs; + object = inode_ctx->object; + + if (inode->ia_type == IA_IFDIR) { + glfd = glfs_h_opendir(fs, object); + if (!glfd) { + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_OPENDIR_FAILED, + "failed to " + "open the directory %s", + uuid_utoa(inode->gfid)); + goto out; + } + } + + if (inode->ia_type == IA_IFREG) { + glfd = glfs_h_open(fs, object, O_RDONLY | O_LARGEFILE); + if (!glfd) { + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_OPEN_FAILED, + "failed to " + "open the file %s", + uuid_utoa(inode->gfid)); + goto out; + } + } + + svs_fd->fd = glfd; + } + + ret = __svs_fd_ctx_set(this, fd, svs_fd); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_SET_FD_CONTEXT_FAILED, + "failed to set fd context " + "for gfid %s", + uuid_utoa(inode->gfid)); + if (svs_fd->fd) { + if (inode->ia_type == IA_IFDIR) { + ret = glfs_closedir(svs_fd->fd); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, errno, + SVS_MSG_CLOSEDIR_FAILED, + "failed to close the fd for %s", + uuid_utoa(inode->gfid)); + } + if (inode->ia_type == IA_IFREG) { + ret = 
glfs_close(svs_fd->fd); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_CLOSE_FAILED, + "failed to close the fd for %s", + uuid_utoa(inode->gfid)); + } + } + ret = -1; + } + +out: + if (ret) { + GF_FREE(svs_fd); + svs_fd = NULL; + } + + return svs_fd; +} + +svs_fd_t * +svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd) +{ + svs_fd_t *svs_fd = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + svs_fd = __svs_fd_ctx_get_or_new(this, fd); + } + UNLOCK(&fd->lock); + +out: + return svs_fd; +} + +int +svs_uuid_generate(xlator_t *this, uuid_t gfid, char *snapname, + uuid_t origin_gfid) +{ + char ino_string[NAME_MAX + 32] = ""; + uuid_t tmp = { + 0, + }; + int ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, snapname, out); + + (void)snprintf(ino_string, sizeof(ino_string), "%s%s", snapname, + uuid_utoa(origin_gfid)); + + if (gf_gfid_generate_from_xxh64(tmp, ino_string)) { + gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_GFID_GEN_FAILED, + "failed to generate " + "gfid for object with actual gfid of %s " + "(snapname: %s, key: %s)", + uuid_utoa(origin_gfid), snapname, ino_string); + goto out; + } + + gf_uuid_copy(gfid, tmp); + + ret = 0; + + gf_msg_debug(this->name, 0, "gfid generated is %s ", uuid_utoa(gfid)); + +out: + return ret; +} + +void +svs_fill_ino_from_gfid(struct iatt *buf) +{ + xlator_t *this = NULL; + + this = THIS; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + + /* consider least significant 8 bytes of value out of gfid */ + if (gf_uuid_is_null(buf->ia_gfid)) { + buf->ia_ino = -1; + goto out; + } + + buf->ia_ino = gfid_to_ino(buf->ia_gfid); +out: + return; +} + +void +svs_iatt_fill(uuid_t gfid, struct iatt *buf) +{ + struct timeval tv = { + 0, + }; + xlator_t *this = NULL; + + this = THIS; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + 
GF_VALIDATE_OR_GOTO(this->name, buf, out); + + buf->ia_type = IA_IFDIR; + buf->ia_uid = 0; + buf->ia_gid = 0; + buf->ia_size = 0; + buf->ia_nlink = 2; + buf->ia_blocks = 8; + buf->ia_size = 4096; + + gf_uuid_copy(buf->ia_gfid, gfid); + svs_fill_ino_from_gfid(buf); + + buf->ia_prot = ia_prot_from_st_mode(0755); + + gettimeofday(&tv, 0); + + buf->ia_mtime = buf->ia_atime = buf->ia_ctime = tv.tv_sec; + buf->ia_mtime_nsec = buf->ia_atime_nsec = buf->ia_ctime_nsec = (tv.tv_usec * + 1000); + +out: + return; +} + +/* priv->snaplist_lock should be held before calling this function */ +snap_dirent_t * +__svs_get_snap_dirent(xlator_t *this, const char *name) +{ + svs_private_t *private = NULL; + int i = 0; + snap_dirent_t *dirents = NULL; + snap_dirent_t *tmp_dirent = NULL; + snap_dirent_t *dirent = NULL; + + private + = this->private; + + dirents = private->dirents; + if (!dirents) { + goto out; + } + + tmp_dirent = dirents; + for (i = 0; i < private->num_snaps; i++) { + if (!strcmp(tmp_dirent->name, name)) { + dirent = tmp_dirent; + break; + } + tmp_dirent++; + } + +out: + return dirent; +} + +glfs_t * +__svs_initialise_snapshot_volume(xlator_t *this, const char *name, + int32_t *op_errno) +{ + svs_private_t *priv = NULL; + int32_t ret = -1; + int32_t local_errno = ESTALE; + snap_dirent_t *dirent = NULL; + char volname[PATH_MAX] = { + 0, + }; + glfs_t *fs = NULL; + int loglevel = GF_LOG_INFO; + char logfile[PATH_MAX] = { + 0, + }; + char *volfile_server = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, name, out); + + priv = this->private; + + dirent = __svs_get_snap_dirent(this, name); + if (!dirent) { + gf_msg_debug(this->name, 0, + "snap entry for " + "name %s not found", + name); + local_errno = ENOENT; + goto out; + } + + if (dirent->fs) { + ret = 0; + fs = dirent->fs; + goto out; + } + + snprintf(volname, sizeof(volname), "/snaps/%s/%s/%s", dirent->name, + 
dirent->snap_volname, dirent->snap_volname); + + fs = glfs_new(volname); + if (!fs) { + local_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, local_errno, SVS_MSG_GLFS_NEW_FAILED, + "glfs instance for snap volume %s " + "failed", + dirent->name); + goto out; + } + + /* + * Before, localhost was used as the volfile server. But, with that + * method, accessing snapshots started giving ENOENT error if a + * specific bind address is mentioned in the glusterd volume file. + * Check the bug https://bugzilla.redhat.com/show_bug.cgi?id=1725211. + * So, the new method is tried below, where, snapview-server first + * uses the volfile server used by the snapd (obtained from the + * command line arguments saved in the global context of the process). + * If the volfile server in global context is NULL, then localhost + * is tried (like before). + */ + if (this->ctx->cmd_args.volfile_server) { + volfile_server = gf_strdup(this->ctx->cmd_args.volfile_server); + if (!volfile_server) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + SVS_MSG_VOLFILE_SERVER_GET_FAIL, + "failed to copy volfile server %s. ", + this->ctx->cmd_args.volfile_server); + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + SVS_MSG_VOLFILE_SERVER_GET_FAIL, + "volfile server is NULL in cmd args. 
" + "Trying with localhost"); + volfile_server = gf_strdup("localhost"); + if (!volfile_server) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + SVS_MSG_VOLFILE_SERVER_GET_FAIL, + "failed to copy volfile server localhost."); + ret = -1; + goto out; + } + } + + ret = glfs_set_volfile_server(fs, "tcp", volfile_server, 24007); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, local_errno, + SVS_MSG_SET_VOLFILE_SERVR_FAILED, + "setting the " + "volfile server %s for snap volume %s " + "failed", + volfile_server, dirent->name); + goto out; + } + + snprintf(logfile, sizeof(logfile), + DEFAULT_SVD_LOG_FILE_DIRECTORY "/snaps/%s/%s-%s.log", + priv->volname, name, dirent->uuid); + + ret = glfs_set_logging(fs, logfile, loglevel); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, local_errno, + SVS_MSG_SET_LOGGING_FAILED, + "failed to set the " + "log file path"); + goto out; + } + + ret = glfs_init(fs); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, local_errno, SVS_MSG_GLFS_INIT_FAILED, + "initing the " + "fs for %s failed", + dirent->name); + goto out; + } + + ret = 0; + +out: + if (ret) { + if (op_errno) + *op_errno = local_errno; + + if (fs) + glfs_fini(fs); + fs = NULL; + } + + if (fs) { + dirent->fs = fs; + } + + GF_FREE(volfile_server); + return fs; +} + +glfs_t * +svs_initialise_snapshot_volume(xlator_t *this, const char *name, + int32_t *op_errno) +{ + glfs_t *fs = NULL; + svs_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, name, out); + + priv = this->private; + + LOCK(&priv->snaplist_lock); + { + fs = __svs_initialise_snapshot_volume(this, name, op_errno); + } + UNLOCK(&priv->snaplist_lock); + +out: + + return fs; +} + +snap_dirent_t * +svs_get_latest_snap_entry(xlator_t *this) +{ + svs_private_t *priv = NULL; + snap_dirent_t *dirents = NULL; + snap_dirent_t *dirent = NULL; + + GF_VALIDATE_OR_GOTO("svs", this, out); + + priv = this->private; + + 
LOCK(&priv->snaplist_lock); + { + dirents = priv->dirents; + if (!dirents) { + goto unlock; + } + if (priv->num_snaps) + dirent = &dirents[priv->num_snaps - 1]; + } +unlock: + UNLOCK(&priv->snaplist_lock); + +out: + return dirent; +} + +glfs_t * +svs_get_latest_snapshot(xlator_t *this) +{ + glfs_t *fs = NULL; + snap_dirent_t *dirent = NULL; + svs_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("svs", this, out); + priv = this->private; + + dirent = svs_get_latest_snap_entry(this); + + if (dirent) { + LOCK(&priv->snaplist_lock); + { + fs = dirent->fs; + } + UNLOCK(&priv->snaplist_lock); + } + +out: + return fs; +} + +glfs_t * +svs_inode_ctx_glfs_mapping(xlator_t *this, svs_inode_t *inode_ctx) +{ + glfs_t *fs = NULL; + + GF_VALIDATE_OR_GOTO("svs", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode_ctx, out); + + fs = inode_ctx->fs; + + SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this); + +out: + return fs; +} + +glfs_t * +svs_inode_glfs_mapping(xlator_t *this, inode_t *inode) +{ + svs_inode_t *inode_ctx = NULL; + glfs_t *fs = NULL; + + inode_ctx = svs_inode_ctx_get(this, inode); + if (!inode_ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for" + " the inode %s", + uuid_utoa(inode->gfid)); + goto out; + } + + fs = svs_inode_ctx_glfs_mapping(this, inode_ctx); + +out: + return fs; +} diff --git a/xlators/features/snapview-server/src/snapview-server-mem-types.h b/xlators/features/snapview-server/src/snapview-server-mem-types.h new file mode 100644 index 00000000000..63456b85323 --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server-mem-types.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __SNAP_VIEW_MEM_TYPES_H +#define __SNAP_VIEW_MEM_TYPES_H + +#include <glusterfs/mem-types.h> + +enum snapview_mem_types { + gf_svs_mt_priv_t = gf_common_mt_end + 1, + gf_svs_mt_svs_inode_t, + gf_svs_mt_dirents_t, + gf_svs_mt_svs_fd_t, + gf_svs_mt_snaplist_t, + gf_svs_mt_end +}; + +#endif diff --git a/xlators/features/snapview-server/src/snapview-server-messages.h b/xlators/features/snapview-server/src/snapview-server-messages.h new file mode 100644 index 00000000000..f634ab5d2b0 --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server-messages.h @@ -0,0 +1,54 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _SNAPVIEW_SERVER_MESSAGES_H_ +#define _SNAPVIEW_SERVER_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(SNAPVIEW_SERVER, SVS_MSG_NO_MEMORY, SVS_MSG_MEM_ACNT_FAILED, + SVS_MSG_NULL_GFID, SVS_MSG_GET_LATEST_SNAP_FAILED, + SVS_MSG_INVALID_GLFS_CTX, SVS_MSG_LOCK_DESTROY_FAILED, + SVS_MSG_SNAPSHOT_LIST_CHANGED, SVS_MSG_MGMT_INIT_FAILED, + SVS_MSG_GET_SNAPSHOT_LIST_FAILED, SVS_MSG_GET_GLFS_H_OBJECT_FAILED, + SVS_MSG_PARENT_CTX_OR_NAME_NULL, SVS_MSG_SET_INODE_CONTEXT_FAILED, + SVS_MSG_GET_INODE_CONTEXT_FAILED, SVS_MSG_NEW_INODE_CTX_FAILED, + SVS_MSG_DELETE_INODE_CONTEXT_FAILED, SVS_MSG_SET_FD_CONTEXT_FAILED, + SVS_MSG_NEW_FD_CTX_FAILED, SVS_MSG_DELETE_FD_CTX_FAILED, + SVS_MSG_GETXATTR_FAILED, SVS_MSG_LISTXATTR_FAILED, + SVS_MSG_RELEASEDIR_FAILED, SVS_MSG_RELEASE_FAILED, + SVS_MSG_TELLDIR_FAILED, SVS_MSG_STAT_FAILED, SVS_MSG_STATFS_FAILED, + SVS_MSG_OPEN_FAILED, SVS_MSG_READ_FAILED, SVS_MSG_READLINK_FAILED, + SVS_MSG_ACCESS_FAILED, SVS_MSG_GET_FD_CONTEXT_FAILED, + SVS_MSG_DICT_SET_FAILED, SVS_MSG_OPENDIR_FAILED, + SVS_MSG_FS_INSTANCE_INVALID, SVS_MSG_SETFSUID_FAIL, + SVS_MSG_SETFSGID_FAIL, SVS_MSG_SETFSGRPS_FAIL, + SVS_MSG_BUILD_TRNSPRT_OPT_FAILED, SVS_MSG_RPC_INIT_FAILED, + SVS_MSG_REG_NOTIFY_FAILED, SVS_MSG_REG_CBK_PRGM_FAILED, + SVS_MSG_RPC_CLNT_START_FAILED, SVS_MSG_XDR_PAYLOAD_FAILED, + SVS_MSG_NULL_CTX, SVS_MSG_RPC_CALL_FAILED, SVS_MSG_XDR_DECODE_FAILED, + SVS_MSG_RSP_DICT_EMPTY, SVS_MSG_DICT_GET_FAILED, + SVS_MSG_SNAP_LIST_REFRESH_FAILED, SVS_MSG_RPC_REQ_FAILED, + SVS_MSG_CLOSEDIR_FAILED, SVS_MSG_CLOSE_FAILED, + SVS_MSG_GFID_GEN_FAILED, SVS_MSG_GLFS_NEW_FAILED, + SVS_MSG_SET_VOLFILE_SERVR_FAILED, SVS_MSG_SET_LOGGING_FAILED, + SVS_MSG_VOLFILE_SERVER_GET_FAIL, SVS_MSG_GLFS_INIT_FAILED); + +#endif /* !_SNAPVIEW_SERVER_MESSAGES_H_ */ diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c new file mode 100644 index 00000000000..ecf31c3b880 --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c @@ -0,0 +1,524 @@ +/* + Copyright (c) 2014 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#include "snapview-server.h" +#include "snapview-server-mem-types.h" +#include <pthread.h> + +int +mgmt_cbk_snap(struct rpc_clnt *rpc, void *mydata, void *data) +{ + xlator_t *this = NULL; + + this = mydata; + GF_ASSERT(this); + + gf_msg("mgmt", GF_LOG_INFO, 0, SVS_MSG_SNAPSHOT_LIST_CHANGED, + "list of snapshots changed"); + + svs_get_snapshot_list(this); + return 0; +} + +static rpcclnt_cb_actor_t svs_cbk_actors[GF_CBK_MAXVALUE] = { + [GF_CBK_GET_SNAPS] = {"GETSNAPS", mgmt_cbk_snap, GF_CBK_GET_SNAPS}, +}; + +static struct rpcclnt_cb_program svs_cbk_prog = { + .progname = "GlusterFS Callback", + .prognum = GLUSTER_CBK_PROGRAM, + .progver = GLUSTER_CBK_VERSION, + .actors = svs_cbk_actors, + .numactors = GF_CBK_MAXVALUE, +}; + +static char *clnt_handshake_procs[GF_HNDSK_MAXVALUE] = { + [GF_HNDSK_NULL] = "NULL", + [GF_HNDSK_EVENT_NOTIFY] = "EVENTNOTIFY", +}; + +static rpc_clnt_prog_t svs_clnt_handshake_prog = { + .progname = "GlusterFS Handshake", + .prognum = GLUSTER_HNDSK_PROGRAM, + .progver = GLUSTER_HNDSK_VERSION, + .procnames = clnt_handshake_procs, +}; + +static int +svs_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, + void *data) +{ + xlator_t *this = NULL; + int ret = 0; + + this = mydata; + + switch (event) { + case RPC_CLNT_CONNECT: + ret = svs_get_snapshot_list(this); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + SVS_MSG_GET_SNAPSHOT_LIST_FAILED, + "Error in refreshing the snaplist " + "infrastructure"); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +svs_mgmt_init(xlator_t *this) +{ + int ret = -1; + svs_private_t *priv = NULL; + dict_t *options = NULL; + int port = 
GF_DEFAULT_BASE_PORT; + char *host = NULL; + cmd_args_t *cmd_args = NULL; + glusterfs_ctx_t *ctx = NULL; + xlator_cmdline_option_t *opt = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, this->ctx, out); + + priv = this->private; + + ctx = this->ctx; + cmd_args = &ctx->cmd_args; + + host = "localhost"; + if (cmd_args->volfile_server) + host = cmd_args->volfile_server; + + options = dict_new(); + if (!options) + goto out; + + opt = find_xlator_option_in_cmd_args_t("address-family", cmd_args); + ret = rpc_transport_inet_options_build(options, host, port, + (opt != NULL ? opt->value : NULL)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_BUILD_TRNSPRT_OPT_FAILED, + "failed to build the " + "transport options"); + goto out; + } + + priv->rpc = rpc_clnt_new(options, this, this->name, 8); + if (!priv->rpc) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_INIT_FAILED, + "failed to initialize RPC"); + goto out; + } + + ret = rpc_clnt_register_notify(priv->rpc, svs_rpc_notify, this); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_REG_NOTIFY_FAILED, + "failed to register notify function"); + goto out; + } + + ret = rpcclnt_cbk_program_register(priv->rpc, &svs_cbk_prog, this); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_REG_CBK_PRGM_FAILED, + "failed to register callback program"); + goto out; + } + + ret = rpc_clnt_start(priv->rpc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_CLNT_START_FAILED, + "failed to start the rpc " + "client"); + goto out; + } + + ret = 0; + + gf_msg_debug(this->name, 0, "svs mgmt init successful"); + +out: + if (options) + dict_unref(options); + if (ret) + if (priv) { + rpc_clnt_connection_cleanup(&priv->rpc->conn); + rpc_clnt_unref(priv->rpc); + priv->rpc = NULL; + } + + return ret; +} + +int +svs_mgmt_submit_request(void *req, call_frame_t *frame, glusterfs_ctx_t *ctx, + 
rpc_clnt_prog_t *prog, int procnum, fop_cbk_fn_t cbkfn, + xdrproc_t xdrproc) +{ + int ret = -1; + int count = 0; + struct iovec iov = { + 0, + }; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + ssize_t xdr_size = 0; + + GF_VALIDATE_OR_GOTO("snapview-server", frame, out); + GF_VALIDATE_OR_GOTO("snapview-server", req, out); + GF_VALIDATE_OR_GOTO("snapview-server", ctx, out); + GF_VALIDATE_OR_GOTO("snapview-server", prog, out); + + GF_ASSERT(frame->this); + + iobref = iobref_new(); + if (!iobref) { + gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM, SVS_MSG_NO_MEMORY, + "failed to allocate " + "new iobref"); + goto out; + } + + if (req) { + xdr_size = xdr_sizeof(xdrproc, req); + + iobuf = iobuf_get2(ctx->iobuf_pool, xdr_size); + if (!iobuf) { + goto out; + } + + iobref_add(iobref, iobuf); + + iov.iov_base = iobuf->ptr; + iov.iov_len = iobuf_pagesize(iobuf); + + /* Create the xdr payload */ + ret = xdr_serialize_generic(iov, req, xdrproc); + if (ret == -1) { + gf_msg(frame->this->name, GF_LOG_WARNING, 0, + SVS_MSG_XDR_PAYLOAD_FAILED, "Failed to create XDR payload"); + goto out; + } + iov.iov_len = ret; + count = 1; + } + + ret = rpc_clnt_submit(ctx->mgmt, prog, procnum, cbkfn, &iov, count, NULL, 0, + iobref, frame, NULL, 0, NULL, 0, NULL); + +out: + if (iobref) + iobref_unref(iobref); + + if (iobuf) + iobuf_unref(iobuf); + return ret; +} + +int +mgmt_get_snapinfo_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + gf_getsnap_name_uuid_rsp rsp = { + 0, + }; + call_frame_t *frame = NULL; + glusterfs_ctx_t *ctx = NULL; + int ret = -1; + dict_t *dict = NULL; + char key[32] = {0}; + int len; + int snapcount = 0; + svs_private_t *priv = NULL; + xlator_t *this = NULL; + int i = 0; + int j = 0; + char *value = NULL; + snap_dirent_t *dirents = NULL; + snap_dirent_t *old_dirents = NULL; + int oldcount = 0; + + GF_VALIDATE_OR_GOTO("snapview-server", req, error_out); + GF_VALIDATE_OR_GOTO("snapview-server", myframe, error_out); + 
GF_VALIDATE_OR_GOTO("snapview-server", iov, error_out); + + frame = myframe; + this = frame->this; + ctx = frame->this->ctx; + priv = this->private; + + if (!ctx) { + errno = EINVAL; + gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_NULL_CTX, + "NULL context"); + goto out; + } + + if (-1 == req->rpc_status) { + errno = EINVAL; + gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_RPC_CALL_FAILED, + "RPC call is not successful"); + goto out; + } + + ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_getsnap_name_uuid_rsp); + if (ret < 0) { + gf_msg(frame->this->name, GF_LOG_ERROR, 0, SVS_MSG_XDR_DECODE_FAILED, + "Failed to decode xdr response, rsp.op_ret = %d", rsp.op_ret); + goto out; + } + + if (rsp.op_ret == -1) { + errno = rsp.op_errno; + ret = -1; + goto out; + } + + if (!rsp.dict.dict_len) { + ret = -1; + errno = EINVAL; + gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_RSP_DICT_EMPTY, + "Response dict is not populated"); + goto out; + } + + dict = dict_new(); + if (!dict) { + ret = -1; + errno = ENOMEM; + goto out; + } + + ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &dict); + if (ret) { + errno = EINVAL; + gf_msg(frame->this->name, GF_LOG_ERROR, errno, + LG_MSG_DICT_UNSERIAL_FAILED, "Failed to unserialize dictionary"); + goto out; + } + + ret = dict_get_int32(dict, "snap-count", (int32_t *)&snapcount); + if (ret) { + errno = EINVAL; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED, + "Error retrieving snapcount"); + goto out; + } + + if (snapcount > 0) { + /* first time we are fetching snap list */ + dirents = GF_CALLOC(snapcount, sizeof(snap_dirent_t), + gf_svs_mt_dirents_t); + if (!dirents) { + errno = ENOMEM; + ret = -1; + gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_NO_MEMORY, + "Unable to allocate memory"); + goto out; + } + } + + for (i = 0; i < snapcount; i++) { + len = snprintf(key, sizeof(key), "snap-volname.%d", i + 1); + ret = dict_get_strn(dict, key, len, &value); + if (ret) 
{ + errno = EINVAL; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED, + "Error retrieving snap volname %d", i + 1); + goto out; + } + + strncpy(dirents[i].snap_volname, value, + sizeof(dirents[i].snap_volname)); + + len = snprintf(key, sizeof(key), "snap-id.%d", i + 1); + ret = dict_get_strn(dict, key, len, &value); + if (ret) { + errno = EINVAL; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED, + "Error retrieving snap uuid %d", i + 1); + goto out; + } + strncpy(dirents[i].uuid, value, sizeof(dirents[i].uuid)); + + len = snprintf(key, sizeof(key), "snapname.%d", i + 1); + ret = dict_get_strn(dict, key, len, &value); + if (ret) { + errno = EINVAL; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED, + "Error retrieving snap name %d", i + 1); + goto out; + } + strncpy(dirents[i].name, value, sizeof(dirents[i].name)); + } + + /* + * Got the new snap list populated in dirents + * The new snap list is either a subset or a superset of + * the existing snaplist old_dirents which has priv->num_snaps + * number of entries. + * + * If subset, then clean up the fs for entries which are + * no longer relevant. + * + * For other overlapping entries set the fs for new dirents + * entries which have a fs assigned already in old_dirents + * + * We do this as we don't want to do new glfs_init()s repeatedly + * as the dirents entries for snapshot volumes get repatedly + * cleaned up and allocated. 
And if we don't then that will lead + * to memleaks + */ + + LOCK(&priv->snaplist_lock); + { + oldcount = priv->num_snaps; + old_dirents = priv->dirents; + for (i = 0; i < priv->num_snaps; i++) { + for (j = 0; j < snapcount; j++) { + if ((!strcmp(old_dirents[i].name, dirents[j].name)) && + (!strcmp(old_dirents[i].uuid, dirents[j].uuid))) { + dirents[j].fs = old_dirents[i].fs; + old_dirents[i].fs = NULL; + break; + } + } + } + + priv->dirents = dirents; + priv->num_snaps = snapcount; + } + UNLOCK(&priv->snaplist_lock); + + if (old_dirents) { + for (i = 0; i < oldcount; i++) { + if (old_dirents[i].fs) + gf_msg_debug(this->name, 0, + "calling glfs_fini on " + "name: %s, snap_volname: %s, uuid: %s", + old_dirents[i].name, old_dirents[i].snap_volname, + old_dirents[i].uuid); + glfs_fini(old_dirents[i].fs); + } + } + + GF_FREE(old_dirents); + + ret = 0; + +out: + if (dict) { + dict_unref(dict); + } + free(rsp.dict.dict_val); + free(rsp.op_errstr); + + if (ret && dirents) { + gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_SNAP_LIST_REFRESH_FAILED, + "Could not update dirents with refreshed snap list"); + GF_FREE(dirents); + } + + if (myframe) + SVS_STACK_DESTROY(myframe); + +error_out: + return ret; +} + +int +svs_get_snapshot_list(xlator_t *this) +{ + gf_getsnap_name_uuid_req req = {{ + 0, + }}; + int ret = -1; + dict_t *dict = NULL; + glusterfs_ctx_t *ctx = NULL; + call_frame_t *frame = NULL; + svs_private_t *priv = NULL; + gf_boolean_t frame_cleanup = _gf_true; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + + ctx = this->ctx; + if (!ctx) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NULL_CTX, "ctx is NULL"); + goto out; + } + + frame = create_frame(this, ctx->pool); + if (!frame) { + gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_FRAME_ERROR, + "Error allocating frame"); + goto out; + } + + priv = this->private; + + dict = dict_new(); + if (!dict) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY, + "Error allocating dictionary"); + goto out; + } + 
+ ret = dict_set_str(dict, "volname", priv->volname); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED, + "Error setting volname in dict"); + goto out; + } + + ret = dict_allocate_and_serialize(dict, &req.dict.dict_val, + &req.dict.dict_len); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_DICT_UNSERIAL_FAILED, + "Failed to serialize dictionary"); + ret = -1; + goto out; + } + + ret = svs_mgmt_submit_request( + &req, frame, ctx, &svs_clnt_handshake_prog, GF_HNDSK_GET_SNAPSHOT_INFO, + mgmt_get_snapinfo_cbk, (xdrproc_t)xdr_gf_getsnap_name_uuid_req); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_REQ_FAILED, + "Error sending snapshot names RPC request"); + } + + frame_cleanup = _gf_false; + +out: + if (dict) { + dict_unref(dict); + } + GF_FREE(req.dict.dict_val); + + if (frame_cleanup && frame) { + /* + * Destroy the frame if we encountered an error + * Else we need to clean it up in + * mgmt_get_snapinfo_cbk + */ + SVS_STACK_DESTROY(frame); + } + + return ret; +} diff --git a/xlators/features/snapview-server/src/snapview-server.c b/xlators/features/snapview-server/src/snapview-server.c new file mode 100644 index 00000000000..76cccae5914 --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server.c @@ -0,0 +1,2720 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#include "snapview-server.h" +#include "snapview-server-mem-types.h" +#include <glusterfs/compat-errno.h> + +#include <glusterfs/xlator.h> +#include "rpc-clnt.h" +#include "xdr-generic.h" +#include "protocol-common.h" +#include <glusterfs/syscall.h> +#include <pthread.h> + +#include "glfs-internal.h" + +int +gf_setcredentials(uid_t *uid, gid_t *gid, uint16_t ngrps, uint32_t *groups) +{ + int ret = 0; + + if (uid) { + ret = glfs_setfsuid(*uid); + if (ret != 0) { + gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSUID_FAIL, + "failed to set uid " + "%u in thread context", + *uid); + return ret; + } + } + if (gid) { + ret = glfs_setfsgid(*gid); + if (ret != 0) { + gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSGID_FAIL, + "failed to set gid " + "%u in thread context", + *gid); + return ret; + } + } + + if (ngrps != 0 && groups) { + ret = glfs_setfsgroups(ngrps, groups); + if (ret != 0) { + gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSGRPS_FAIL, + "failed to set " + "groups in thread context"); + return ret; + } + } + return 0; +} + +int32_t +svs_lookup_entry_point(xlator_t *this, loc_t *loc, inode_t *parent, + struct iatt *buf, struct iatt *postparent, + int32_t *op_errno) +{ + uuid_t gfid; + svs_inode_t *inode_ctx = NULL; + int op_ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + GF_VALIDATE_OR_GOTO(this->name, postparent, out); + + if (gf_uuid_is_null(loc->inode->gfid)) { + gf_uuid_generate(gfid); + svs_iatt_fill(gfid, buf); + + /* Here the inode context of the entry point directory + is filled with just the type of the inode and the gfid + of the parent from where the entry point was entered. + The glfs object and the fs instance will be NULL. 
+ */ + if (parent) + svs_iatt_fill(parent->gfid, postparent); + else { + svs_iatt_fill(buf->ia_gfid, postparent); + } + + inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + *op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_NEW_INODE_CTX_FAILED, + "failed to " + "allocate inode context for entry point " + "directory"); + goto out; + } + + gf_uuid_copy(inode_ctx->pargfid, loc->pargfid); + memcpy(&inode_ctx->buf, buf, sizeof(*buf)); + inode_ctx->type = SNAP_VIEW_ENTRY_POINT_INODE; + } else { + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (inode_ctx) { + memcpy(buf, &inode_ctx->buf, sizeof(*buf)); + svs_iatt_fill(inode_ctx->pargfid, postparent); + } else { + svs_iatt_fill(loc->inode->gfid, buf); + if (parent) + svs_iatt_fill(parent->gfid, postparent); + else { + svs_iatt_fill(loc->inode->gfid, postparent); + } + } + } + + op_ret = 0; + +out: + return op_ret; +} + +/* When lookup comes from client and the protocol/server tries to resolve + the pargfid via just sending the gfid as part of lookup, if the inode + for the parent gfid is not found. But since that gfid has not yet been + looked up yet, inode will not be having inode context and parent is not + there (as it is the parent of the entry that is being resolved). So + without parent and inode context, svs cannot know which snapshot + to look into. In such cases, the amguity is handled by looking + into the latest snapshot. If the directory is there in the latest + snapshot, lookup is successful, otherwise it is a failure. So for + any directory created after taking the latest snapshot, entry into + snapshot world is denied. i.e you have to be part of snapshot world + to enter it. If the gfid is not found there, then unwind with + ESTALE + This gets executed mainly in the situation where the snapshot entry + point is entered from a non-root directory and that non-root directory's + inode (or gfid) is not yet looked up. 
And in each case when a gfid has to + be looked up (without any inode contex and parent context present), last + snapshot is referred and a random gfid is not generated. +*/ +int32_t +svs_lookup_gfid(xlator_t *this, loc_t *loc, struct iatt *buf, + struct iatt *postparent, int32_t *op_errno) +{ + int32_t op_ret = -1; + unsigned char handle_obj[GFAPI_HANDLE_LENGTH] = { + 0, + }; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + struct stat statbuf = { + 0, + }; + svs_inode_t *inode_ctx = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + GF_VALIDATE_OR_GOTO(this->name, postparent, out); + + if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NULL_GFID, "gfid is NULL"); + goto out; + } + + if (!gf_uuid_is_null(loc->inode->gfid)) + memcpy(handle_obj, loc->inode->gfid, GFAPI_HANDLE_LENGTH); + else + memcpy(handle_obj, loc->gfid, GFAPI_HANDLE_LENGTH); + + fs = svs_get_latest_snapshot(this); + if (!fs) { + op_ret = -1; + *op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_GET_LATEST_SNAP_FAILED, + "failed to get the latest " + "snapshot"); + goto out; + } + + object = glfs_h_create_from_handle(fs, handle_obj, GFAPI_HANDLE_LENGTH, + &statbuf); + if (!object) { + op_ret = -1; + *op_errno = ESTALE; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_GET_GLFS_H_OBJECT_FAILED, + "failed to do lookup and get " + "the handle on the snapshot %s (path: %s, gfid: %s)", + loc->name, loc->path, uuid_utoa(loc->gfid)); + goto out; + } + + inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + *op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_NEW_INODE_CTX_FAILED, + "failed to allocate inode " + "context"); + goto out; + } + + iatt_from_stat(buf, &statbuf); + if 
(!gf_uuid_is_null(loc->gfid)) + gf_uuid_copy(buf->ia_gfid, loc->gfid); + else + gf_uuid_copy(buf->ia_gfid, loc->inode->gfid); + + inode_ctx->type = SNAP_VIEW_VIRTUAL_INODE; + inode_ctx->fs = fs; + inode_ctx->object = object; + memcpy(&inode_ctx->buf, buf, sizeof(*buf)); + svs_iatt_fill(buf->ia_gfid, postparent); + + op_ret = 0; + +out: + return op_ret; +} + +/* If the parent is an entry point inode, then create the handle for the + snapshot on which lookup came. i.e in reality lookup came on + the directory from which the entry point directory was entered, but + lookup is into the past. So create the handle for it by doing + the name-less lookup on the gfid (which can be obtained from + parent's context +*/ +int32_t +svs_lookup_snapshot(xlator_t *this, loc_t *loc, struct iatt *buf, + struct iatt *postparent, inode_t *parent, + svs_inode_t *parent_ctx, int32_t *op_errno) +{ + int32_t op_ret = -1; + unsigned char handle_obj[GFAPI_HANDLE_LENGTH] = { + 0, + }; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + struct stat statbuf = { + 0, + }; + svs_inode_t *inode_ctx = NULL; + uuid_t gfid; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + GF_VALIDATE_OR_GOTO(this->name, postparent, out); + GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out); + GF_VALIDATE_OR_GOTO(this->name, parent, out); + + fs = svs_initialise_snapshot_volume(this, loc->name, op_errno); + if (!fs) { + gf_msg_debug(this->name, 0, + "failed to create " + "the fs instance for snap %s", + loc->name); + *op_errno = ENOENT; + op_ret = -1; + goto out; + } + + memcpy(handle_obj, parent_ctx->pargfid, GFAPI_HANDLE_LENGTH); + object = glfs_h_create_from_handle(fs, handle_obj, GFAPI_HANDLE_LENGTH, + &statbuf); + if (!object) { + op_ret = -1; + *op_errno = errno; + /* Should this be in warning or error mode? 
*/ + gf_msg_debug(this->name, 0, + "failed to do lookup and " + "get the handle on the snapshot %s", + loc->name); + goto out; + } + + inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + *op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_NEW_INODE_CTX_FAILED, + "failed to allocate " + "inode context"); + goto out; + } + + if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) + gf_uuid_generate(gfid); + else { + if (!gf_uuid_is_null(loc->inode->gfid)) + gf_uuid_copy(gfid, loc->inode->gfid); + else + gf_uuid_copy(gfid, loc->gfid); + } + iatt_from_stat(buf, &statbuf); + gf_uuid_copy(buf->ia_gfid, gfid); + svs_fill_ino_from_gfid(buf); + inode_ctx->type = SNAP_VIEW_SNAPSHOT_INODE; + inode_ctx->fs = fs; + inode_ctx->object = object; + memcpy(&inode_ctx->buf, buf, sizeof(*buf)); + svs_iatt_fill(parent->gfid, postparent); + + SVS_STRDUP(inode_ctx->snapname, loc->name); + if (!inode_ctx->snapname) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + op_ret = 0; + +out: + if (op_ret) { + if (object) + glfs_h_close(object); + + if (inode_ctx) + inode_ctx->object = NULL; + } + + return op_ret; +} + +/* Both parent and entry are from snapshot world */ +int32_t +svs_lookup_entry(xlator_t *this, loc_t *loc, struct iatt *buf, + struct iatt *postparent, inode_t *parent, + svs_inode_t *parent_ctx, int32_t *op_errno) +{ + int32_t op_ret = -1; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + struct stat statbuf = { + 0, + }; + svs_inode_t *inode_ctx = NULL; + glfs_object_t *parent_object = NULL; + uuid_t gfid = { + 0, + }; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + GF_VALIDATE_OR_GOTO(this->name, postparent, out); + GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out); + GF_VALIDATE_OR_GOTO(this->name, parent, out); + + parent_object = 
parent_ctx->object; + fs = parent_ctx->fs; + + object = glfs_h_lookupat(fs, parent_object, loc->name, &statbuf, 0); + if (!object) { + /* should this be in WARNING or ERROR mode? */ + gf_msg_debug(this->name, 0, + "failed to do lookup and " + "get the handle for entry %s (path: %s)", + loc->name, loc->path); + op_ret = -1; + *op_errno = errno; + goto out; + } + + if (gf_uuid_is_null(object->gfid)) { + /* should this be in WARNING or ERROR mode? */ + gf_msg_debug(this->name, 0, + "gfid from glfs handle is " + "NULL for entry %s (path: %s)", + loc->name, loc->path); + op_ret = -1; + *op_errno = errno; + goto out; + } + + inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + *op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_NEW_INODE_CTX_FAILED, + "failed to allocate " + "inode context"); + goto out; + } + + if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) { + if (svs_uuid_generate(this, gfid, parent_ctx->snapname, object->gfid)) { + /* + * should op_errno be something else such as + * EINVAL or ESTALE? 
+ */ + op_ret = -1; + *op_errno = EIO; + goto out; + } + } else { + if (!gf_uuid_is_null(loc->inode->gfid)) + gf_uuid_copy(gfid, loc->inode->gfid); + else + gf_uuid_copy(gfid, loc->gfid); + } + + iatt_from_stat(buf, &statbuf); + gf_uuid_copy(buf->ia_gfid, gfid); + svs_fill_ino_from_gfid(buf); + inode_ctx->type = SNAP_VIEW_VIRTUAL_INODE; + inode_ctx->fs = fs; + inode_ctx->object = object; + memcpy(&inode_ctx->buf, buf, sizeof(*buf)); + svs_iatt_fill(parent->gfid, postparent); + + if (IA_ISDIR(buf->ia_type)) { + SVS_STRDUP(inode_ctx->snapname, parent_ctx->snapname); + if (!inode_ctx->snapname) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + } + + op_ret = 0; + +out: + if (op_ret) { + if (object) + glfs_h_close(object); + + if (inode_ctx) + inode_ctx->object = NULL; + } + + return op_ret; +} + +/* inode context is there means lookup has come on an object which was + built either as part of lookup or as part of readdirp. But in readdirp + we would not have got the handle to access the object in the gfapi + world. + So if inode context contains glfs_t instance for the right + gfapi world and glfs_object_t handle for accessing it in the gfapi + world, then unwind with success as the snapshots as of now are + read-only. + If the above condition is not met, then send lookup call again to + the gfapi world. It can happen only if both parent context and + the name of the entry are present. + + If parent is an entry point to snapshot world: + * parent is needed for getting the gfid on which lookup has to be done + (the gfid present in the inode is a virtual gfid) in the snapshot + world. + * name is required to get the right glfs_t instance on which lookup + has to be done + + If parent is a directory from snapshot world: + * parent context is needed to get the glfs_t instance and to get the + handle to parent directory in the snapshot world. 
+ * name is needed to do the lookup on the right entry in the snapshot + world +*/ +int32_t +svs_revalidate(xlator_t *this, loc_t *loc, inode_t *parent, + svs_inode_t *inode_ctx, svs_inode_t *parent_ctx, + struct iatt *buf, struct iatt *postparent, int32_t *op_errno) +{ + int32_t op_ret = -1; + int ret = -1; + char tmp_uuid[64] = { + 0, + }; + glfs_t *fs = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, buf, out); + GF_VALIDATE_OR_GOTO(this->name, postparent, out); + GF_VALIDATE_OR_GOTO(this->name, inode_ctx, out); + + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + svs_iatt_fill(loc->inode->gfid, buf); + if (parent) + svs_iatt_fill(parent->gfid, postparent); + else + svs_iatt_fill(loc->inode->gfid, postparent); + op_ret = 0; + goto out; + } else { + /* Though fs and object are present in the inode context, its + * better to check if fs is valid or not before doing anything. + * Its for the protection from the following operations. + * 1) Create a file on the glusterfs mount point + * 2) Create a snapshot (say "snap1") + * 3) Access the contents of the snapshot + * 4) Delete the file from the mount point + * 5) Delete the snapshot "snap1" + * 6) Create a new snapshot "snap1" + * + * Now accessing the new snapshot "snap1" gives problems. + * Because the inode and dentry created for snap1 would not be + * deleted upon the deletion of the snapshot (as deletion of + * snapshot is a gluster cli operation, not a fop). So next time + * upon creation of a new snap with same name, the previous + * inode and dentry itself will be used. But the inode context + * contains old information about the glfs_t instance and the + * handle in the gfapi world. Thus the glfs_t instance should + * be checked before accessing. If its wrong, then right + * instance should be obtained by doing the lookup. 
+ */ + if (inode_ctx->fs && inode_ctx->object) { + fs = inode_ctx->fs; + SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this); + if (fs) { + memcpy(buf, &inode_ctx->buf, sizeof(*buf)); + if (parent) + svs_iatt_fill(parent->gfid, postparent); + else + svs_iatt_fill(buf->ia_gfid, postparent); + op_ret = 0; + goto out; + } else { + inode_ctx->fs = NULL; + inode_ctx->object = NULL; + ret = svs_get_handle(this, loc, inode_ctx, op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_GET_GLFS_H_OBJECT_FAILED, + "failed to get the handle for " + "%s (gfid %s)", + loc->path, uuid_utoa_r(loc->inode->gfid, tmp_uuid)); + op_ret = -1; + goto out; + } + } + } + + /* To send the lookup to gfapi world, both the name of the + entry as well as the parent context is needed. + */ + if (!loc->name || !parent_ctx) { + *op_errno = ESTALE; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, + SVS_MSG_PARENT_CTX_OR_NAME_NULL, "%s is NULL", + loc->name ? "parent context" : "loc->name"); + goto out; + } + + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) + op_ret = svs_lookup_snapshot(this, loc, buf, postparent, parent, + parent_ctx, op_errno); + else + op_ret = svs_lookup_entry(this, loc, buf, postparent, parent, + parent_ctx, op_errno); + + goto out; + } + +out: + return op_ret; +} + +int32_t +svs_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + struct iatt buf = { + 0, + }; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct iatt postparent = { + 0, + }; + svs_inode_t *inode_ctx = NULL; + svs_inode_t *parent_ctx = NULL; + int32_t ret = -1; + inode_t *parent = NULL; + gf_boolean_t entry_point_key = _gf_false; + gf_boolean_t entry_point = _gf_false; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("svs", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + 
op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + /* For lookups sent on inodes (i.e not parent inode + basename, but + direct inode itself which usually is a nameless lookup or revalidate + on the inode), loc->name will not be there. Get it from path if + it is there. + This is the difference between nameless lookup and revalidate lookup + on an inode: + nameless lookup: loc->path contains gfid and strrchr on it fails + revalidate lookup: loc->path contains the entry name of the inode + and strrchr gives the name of the entry from path + */ + if (loc->path) { + if (!loc->name || (loc->name && !strcmp(loc->name, ""))) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + } + } + + if (loc->parent) + parent = inode_ref(loc->parent); + else { + parent = inode_find(loc->inode->table, loc->pargfid); + if (!parent) + parent = inode_parent(loc->inode, NULL, NULL); + } + if (parent) + parent_ctx = svs_inode_ctx_get(this, parent); + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + + if (xdata && !inode_ctx) { + ret = dict_get_str_boolean(xdata, "entry-point", _gf_false); + if (ret == -1) { + gf_msg_debug(this->name, 0, + "failed to get the " + "entry point info"); + entry_point_key = _gf_false; + } else { + entry_point_key = ret; + } + + if (loc->name && strlen(loc->name)) { + /* lookup can come with the entry-point set in the dict + * for the parent directory of the entry-point as well. + * So consider entry_point only for named lookup + */ + entry_point = entry_point_key; + } + } + + if (inode_ctx && inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + /* entry-point may not be set in the dictonary. 
+ * This can happen if snap-view client is restarted where + * inode-ctx not available and a nameless lookup has come + */ + entry_point = _gf_true; + } + + /* lookup is on the entry point to the snapshot world */ + if (entry_point) { + op_ret = svs_lookup_entry_point(this, loc, parent, &buf, &postparent, + &op_errno); + goto out; + } + + /* revalidate */ + if (inode_ctx) { + op_ret = svs_revalidate(this, loc, parent, inode_ctx, parent_ctx, &buf, + &postparent, &op_errno); + goto out; + } + + /* This can happen when entry point directory is entered from non-root + directory. (ex: if /mnt/glusterfs is the mount point, then entry + point (say .snaps) is entered from /mnt/glusterfs/dir/.snaps). Also + it can happen when client sends a nameless lookup on just a gfid and + the server does not have the inode in the inode table. + */ + if (!inode_ctx && !parent_ctx) { + if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) { + op_ret = -1; + op_errno = ESTALE; + gf_msg_debug(this->name, 0, + "gfid is NULL. Either the lookup " + "came on missing entry or the " + "entry is stale"); + goto out; + } + + if (!entry_point_key) { + /* This can happen when there is no inode_ctx available. + * snapview-server might have restarted or + * graph change might have happened + */ + op_ret = -1; + op_errno = ESTALE; + goto out; + } + + /* lookup is on the parent directory of entry-point. 
+ * this would have already looked up by snap-view client + * so return success + */ + if (!gf_uuid_is_null(loc->gfid)) + gf_uuid_copy(buf.ia_gfid, loc->gfid); + else + gf_uuid_copy(buf.ia_gfid, loc->inode->gfid); + + svs_iatt_fill(buf.ia_gfid, &buf); + svs_iatt_fill(buf.ia_gfid, &postparent); + + op_ret = 0; + goto out; + } + + if (parent_ctx) { + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) + op_ret = svs_lookup_snapshot(this, loc, &buf, &postparent, parent, + parent_ctx, &op_errno); + else + op_ret = svs_lookup_entry(this, loc, &buf, &postparent, parent, + parent_ctx, &op_errno); + goto out; + } + +out: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, + loc ? loc->inode : NULL, &buf, xdata, &postparent); + + if (parent) + inode_unref(parent); + + return 0; +} + +int32_t +svs_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + svs_inode_t *inode_ctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + svs_fd_t *svs_fd = NULL; + glfs_fd_t *glfd = NULL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = ESTALE; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found " + "for the inode %s", + uuid_utoa(loc->inode->gfid)); + goto out; + } + + /* Fake success is sent if the opendir is on the entry point directory + or the inode is SNAP_VIEW_ENTRY_POINT_INODE + */ + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + op_ret = 0; + op_errno = 0; 
+ goto out; + } else { + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, + op_errno, out); + + glfd = glfs_h_opendir(fs, object); + if (!glfd) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_OPENDIR_FAILED, + "opendir on %s failed " + "(gfid: %s)", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } + svs_fd = svs_fd_ctx_get_or_new(this, fd); + if (!svs_fd) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_NEW_FD_CTX_FAILED, + "failed to allocate fd context " + "for %s (gfid: %s)", + loc->name, uuid_utoa(fd->inode->gfid)); + glfs_closedir(glfd); + goto out; + } + svs_fd->fd = glfd; + + op_ret = 0; + op_errno = 0; + } + +out: + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL); + + return 0; +} + +/* + * This function adds the xattr keys present in the list (@list) to the dict. + * But the list contains only the names of the xattrs (and no value, as + * the gfapi functions for the listxattr operations would return only the + * names of the xattrs in the buffer provided by the caller, though they had + * got the values of those xattrs from posix) as described in the man page of + * listxattr. But before unwinding snapview-server has to put those names + * back into the dict. But to get the values for those xattrs it has to do the + * getxattr operation on each xattr which might turn out to be a costly + * operation. So for each of the xattrs present in the list, a 0 byte value + * ("") is set into the dict before unwinding. Since ("") is also a valid xattr + * value(in a file system) we use an extra key in the same dictionary as an + * indicator to other xlators which want to cache the xattrs (as of now, + * md-cache which caches acl and selinux related xattrs) to not to cache the + * values of the xattrs present in the dict. 
+ */ +int32_t +svs_add_xattrs_to_dict(xlator_t *this, dict_t *dict, char *list, ssize_t size) +{ + char keybuffer[4096] = { + 0, + }; + size_t remaining_size = 0; + size_t key_len = 0; + int32_t list_offset = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("snapview-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + GF_VALIDATE_OR_GOTO(this->name, list, out); + + remaining_size = size; + list_offset = 0; + while (remaining_size > 0) { + /* Length of this entry as stored in @list (name plus its NUL). + * The cursor must advance by this and not by strlen(keybuffer): + * keybuffer is cut at sizeof(keybuffer) - 1 bytes and is rewritten + * from newkey in the Darwin branch below, so its length need not + * match the entry in @list. + */ + key_len = strlen(list + list_offset) + 1; + strncpy(keybuffer, list + list_offset, sizeof(keybuffer) - 1); +#ifdef GF_DARWIN_HOST_OS + /* The protocol expect namespace for now */ + char *newkey = NULL; + gf_add_prefix(XATTR_USER_PREFIX, keybuffer, &newkey); + strcpy(keybuffer, newkey); + GF_FREE(newkey); +#endif + ret = dict_set_str(dict, keybuffer, ""); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED, + "dict set operation " + "for the key %s failed.", + keybuffer); + goto out; + } + + /* Clamp so that the unsigned counter can never wrap around. */ + remaining_size -= (key_len < remaining_size) ? key_len : remaining_size; + list_offset += key_len; + } /* while (remaining_size > 0) */ + + /* Add an additional key to indicate that we don't need to cache these + * xattrs(with value "") */ + ret = dict_set_str(dict, "glusterfs.skip-cache", ""); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED, + "dict set operation for the key glusterfs.skip-cache failed."); + goto out; + } + + ret = 0; + +out: + return ret; +} + +int32_t +svs_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) +{ + svs_inode_t *inode_ctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + char *value = 0; + ssize_t size = 0; + dict_t *dict = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", frame, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", loc, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", loc->inode, out); + + root = frame->root; + op_ret = 
gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = ESTALE; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found " + "for the inode %s", + uuid_utoa(loc->inode->gfid)); + goto out; + } + + /* ENODATA is sent if the getxattr is on entry point directory + or the inode is SNAP_VIEW_ENTRY_POINT_INODE. Entry point is + a virtual directory on which setxattr operations are not + allowed. If getxattr has to be faked as success, then a value + for the name of the xattr has to be sent which we don't have. + */ + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + op_ret = -1; + op_errno = ENODATA; + goto out; + } else { + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, + op_errno, out); + + dict = dict_new(); + if (!dict) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to allocate dict"); + goto out; + } + + /* First call passes no buffer, only to learn the needed size. */ + size = glfs_h_getxattrs(fs, object, name, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + if (errno == ENODATA) { + gf_msg_debug(this->name, 0, + "getxattr on " + "%s failed (key: %s) with %s", + loc->path, name, strerror(errno)); + } else { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GETXATTR_FAILED, + "getxattr on %s failed (key: %s) with %s", loc->path, + name, strerror(errno)); + } + goto out; + } + value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to allocate memory for getxattr " + "on %s (key: %s)", + loc->name, name); + goto out; + } + + size = glfs_h_getxattrs(fs, object, name, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, 
SVS_MSG_GETXATTR_FAILED, + "failed to get the xattr %s for " + "entry %s", + name, loc->name); + goto out; + } + value[size] = '\0'; + + if (name) { + op_ret = dict_set_dynptr(dict, (char *)name, value, size); + if (op_ret < 0) { + op_errno = -op_ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_DICT_SET_FAILED, + "dict set operation for %s for " + "the key %s failed.", + loc->path, name); + GF_FREE(value); + value = NULL; + goto out; + } + } else { + op_ret = svs_add_xattrs_to_dict(this, dict, value, size); + if (op_ret == -1) { + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to add xattrs from the list to " + "dict for %s (gfid: %s)", + loc->path, uuid_utoa(loc->inode->gfid)); + goto out; + } + GF_FREE(value); + value = NULL; + } + } + +out: + if (op_ret && value) + GF_FREE(value); + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, NULL); + + if (dict) + dict_unref(dict); + + return 0; +} + +int32_t +svs_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + svs_inode_t *inode_ctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *value = 0; + ssize_t size = 0; + dict_t *dict = NULL; + svs_fd_t *sfd = NULL; + glfs_fd_t *glfd = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", frame, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", fd, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", fd->inode, out); + + inode_ctx = svs_inode_ctx_get(this, fd->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = ESTALE; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found " + "for the inode %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + if (!(svs_inode_ctx_glfs_mapping(this, inode_ctx))) { + op_ret = -1; + op_errno = EBADF; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_FS_INSTANCE_INVALID, + "glfs instance %p to which the inode 
%s " + "belongs to does not exist. The snapshot " + "corresponding to the instance might have" + "been deleted or deactivated", + inode_ctx->fs, uuid_utoa(fd->inode->gfid)); + goto out; + } + + sfd = svs_fd_ctx_get_or_new(this, fd); + if (!sfd) { + op_ret = -1; + op_errno = EBADFD; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_FD_CONTEXT_FAILED, + "failed to get the fd " + "context for %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + glfd = sfd->fd; + /* EINVAL is sent if the getxattr is on entry point directory + or the inode is SNAP_VIEW_ENTRY_POINT_INODE. Entry point is + a virtual directory on which setxattr operations are not + allowed. If getxattr has to be faked as success, then a value + for the name of the xattr has to be sent which we don't have. + */ + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } else { + dict = dict_new(); + if (!dict) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to allocate dict " + "(gfid: %s, key: %s)", + uuid_utoa(fd->inode->gfid), name); + goto out; + } + + if (name) { + size = glfs_fgetxattr(glfd, name, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GETXATTR_FAILED, + "getxattr on %s failed " + "(key: %s)", + uuid_utoa(fd->inode->gfid), name); + goto out; + } + value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to " + "allocate memory for getxattr on %s " + "(key: %s)", + uuid_utoa(fd->inode->gfid), name); + goto out; + } + + size = glfs_fgetxattr(glfd, name, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GETXATTR_FAILED, + "failed to get the xattr %s " + "for inode %s", + name, 
uuid_utoa(fd->inode->gfid)); + goto out; + } + value[size] = '\0'; + + op_ret = dict_set_dynptr(dict, (char *)name, value, size); + if (op_ret < 0) { + op_errno = -op_ret; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_DICT_SET_FAILED, + "dict set operation for gfid %s " + "for the key %s failed.", + uuid_utoa(fd->inode->gfid), name); + goto out; + } + } else { + size = glfs_flistxattr(glfd, NULL, 0); + if (size == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_LISTXATTR_FAILED, "listxattr on %s failed", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char); + if (!value) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to " + "allocate buffer for xattr " + "list (%s)", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + size = glfs_flistxattr(glfd, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_LISTXATTR_FAILED, "listxattr on %s failed", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + op_ret = svs_add_xattrs_to_dict(this, dict, value, size); + if (op_ret == -1) { + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to add xattrs from the list " + "to dict (gfid: %s)", + uuid_utoa(fd->inode->gfid)); + goto out; + } + GF_FREE(value); + } + + op_ret = 0; + op_errno = 0; + } + +out: + if (op_ret) + GF_FREE(value); + + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, NULL); + + if (dict) + dict_unref(dict); + + return 0; +} + +int32_t +svs_releasedir(xlator_t *this, fd_t *fd) +{ + svs_fd_t *sfd = NULL; + uint64_t tmp_pfd = 0; + int ret = 0; + svs_inode_t *svs_inode = NULL; + glfs_t *fs = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + 
gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd); + goto out; + } + + /* Take the fd context as soon as it has been detached from the fd, so + * that the GF_FREE() below releases it even when fs is NULL after the + * snapshot handle check (sfd used to stay NULL on that path and the + * context was leaked). + */ + sfd = (svs_fd_t *)(long)tmp_pfd; + + inode = fd->inode; + + svs_inode = svs_inode_ctx_get(this, inode); + if (svs_inode) { + fs = svs_inode->fs; /* should inode->lock be held for this? */ + SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this); + if (fs) { + /* The glfd is closed only while fs passed the check above. */ + if (sfd && sfd->fd) { + ret = glfs_closedir(sfd->fd); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, errno, + SVS_MSG_RELEASEDIR_FAILED, + "failed to close the glfd for " + "directory %s", + uuid_utoa(fd->inode->gfid)); + } + } + } + + GF_FREE(sfd); + +out: + return 0; +} + +/* Flush has nothing to write back here; it only checks the credentials, the + * inode context and (except on the entry point inode) the fd context, and + * unwinds with the result. + */ +int32_t +svs_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int ret = -1; + uint64_t value = 0; + svs_inode_t *inode_ctx = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, fd->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for" + " the inode %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + ret = fd_ctx_get(fd, this, &value); + if (ret < 0 && inode_ctx->type != SNAP_VIEW_ENTRY_POINT_INODE) { + /* op_ret is 0 at this point (gf_setcredentials() succeeded), so the + * failure has to be set explicitly or the unwind reports success + * together with op_errno = EINVAL. + */ + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, op_errno, + SVS_MSG_GET_FD_CONTEXT_FAILED, "pfd is NULL on fd=%p", fd); + goto out; + } + + op_ret = 0; + +out: + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL); + + return 0; +} + +int32_t +svs_release(xlator_t *this, fd_t *fd) +{ + svs_fd_t *sfd = NULL; + uint64_t tmp_pfd = 0; + int ret = 0; + inode_t *inode = NULL; + svs_inode_t *svs_inode = NULL; + glfs_t *fs = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", 
this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd); + goto out; + } + + /* Take the fd context as soon as it has been detached from the fd, so + * that the GF_FREE() below releases it even when fs is NULL after the + * snapshot handle check (sfd used to stay NULL on that path and the + * context was leaked). + */ + sfd = (svs_fd_t *)(long)tmp_pfd; + + inode = fd->inode; + + svs_inode = svs_inode_ctx_get(this, inode); + if (svs_inode) { + fs = svs_inode->fs; /* should inode->lock be held for this? */ + SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this); + if (fs) { + /* The glfd is closed only while fs passed the check above. */ + if (sfd && sfd->fd) { + ret = glfs_close(sfd->fd); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, errno, + SVS_MSG_RELEASE_FAILED, + "failed to close " + "the glfd for %s", + uuid_utoa(fd->inode->gfid)); + } + } + } + + GF_FREE(sfd); +out: + return 0; +} + +int32_t +svs_forget(xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t value = 0; + svs_inode_t *inode_ctx = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = inode_ctx_del(inode, this, &value); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DELETE_INODE_CONTEXT_FAILED, + "failed to delete the inode " + "context of %s", + uuid_utoa(inode->gfid)); + goto out; + } + + inode_ctx = (svs_inode_t *)(uintptr_t)value; + if (!inode_ctx) + goto out; + + if (inode_ctx->snapname) + GF_FREE(inode_ctx->snapname); + + /* + * glfs_h_close leads to unref and forgetting of the + * underlying inode in the gfapi world. i.e. the inode + * which inode_ctx->object points to. + * As of now the only possibility is, this forget came as a + * result of snapdaemon's inode table reaching the lru + * limit and receiving forget as a result of purging of + * extra inodes that exceeded the limit. But, care must + * be taken to ensure that, the gfapi instance to which + * the glfs_h_object belongs to is not deleted. Otherwise + * this might result in access of a freed pointer. + * This will still be helpful in reducing the memory + * footprint of snapdaemon when the fs instance itself is + * valid (i.e. 
present and not destroyed due to either snap + * deactivate or snap delete), but the lru limit is reached. + * The forget due to lru limit will make the underlying inode + * being unrefed and forgotten. + */ + if (svs_inode_ctx_glfs_mapping(this, inode_ctx)) { + glfs_h_close(inode_ctx->object); + inode_ctx->object = NULL; + } + GF_FREE(inode_ctx); + +out: + return 0; +} + +int +svs_fill_readdir(xlator_t *this, gf_dirent_t *entries, size_t size, off_t off) +{ + gf_dirent_t *entry = NULL; + svs_private_t *priv = NULL; + int i = 0; + snap_dirent_t *dirents = NULL; + int this_size = 0; + int filled_size = 0; + int count = 0; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO("snap-view-daemon", entries, out); + + priv = this->private; + GF_ASSERT(priv); + + /* create the dir entries */ + LOCK(&priv->snaplist_lock); + { + dirents = priv->dirents; + + for (i = off; i < priv->num_snaps;) { + this_size = sizeof(gf_dirent_t) + strlen(dirents[i].name) + 1; + if (this_size + filled_size > size) + goto unlock; + + entry = gf_dirent_for_name(dirents[i].name); + if (!entry) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY, + "failed to allocate dentry for %s", dirents[i].name); + goto unlock; + } + + entry->d_off = i + 1; + /* + * readdir on the entry-point directory to the snapshot + * world, will return elements in the list of the + * snapshots as the directory entries. Since the entries + * returned are virtual entries which does not exist + * physically on the disk, pseudo inode numbers are + * generated. 
+ */ + entry->d_ino = i + 2 * 42; + entry->d_type = DT_DIR; + list_add_tail(&entry->list, &entries->list); + ++i; + count++; + filled_size += this_size; + } + } +unlock: + UNLOCK(&priv->snaplist_lock); + +out: + return count; +} + +int32_t +svs_glfs_readdir(xlator_t *this, glfs_fd_t *glfd, gf_dirent_t *entries, + int32_t *op_errno, struct iatt *buf, gf_boolean_t readdirplus, + size_t size) +{ + int filled_size = 0; + int this_size = 0; + int32_t ret = -1; + int32_t count = 0; + gf_dirent_t *entry = NULL; + struct dirent *dirents = NULL; + struct dirent de = { + 0, + }; + struct stat statbuf = { + 0, + }; + off_t in_case = -1; + + GF_VALIDATE_OR_GOTO("svs", this, out); + GF_VALIDATE_OR_GOTO(this->name, glfd, out); + GF_VALIDATE_OR_GOTO(this->name, entries, out); + + while (filled_size < size) { + in_case = glfs_telldir(glfd); + if (in_case == -1) { + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_TELLDIR_FAILED, + "telldir failed"); + break; + } + + if (readdirplus) + ret = glfs_readdirplus_r(glfd, &statbuf, &de, &dirents); + else + ret = glfs_readdir_r(glfd, &de, &dirents); + + if (ret == 0 && dirents != NULL) { + if (readdirplus) + this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) + + strlen(de.d_name) + 1; + else + this_size = sizeof(gf_dirent_t) + strlen(de.d_name) + 1; + + if (this_size + filled_size > size) { + glfs_seekdir(glfd, in_case); + break; + } + + entry = gf_dirent_for_name(de.d_name); + if (!entry) { + /* + * Since gf_dirent_for_name can return + * NULL only when it fails to allocate + * memory for the directory entry, + * SVS_MSG_NO_MEMORY is used as the + * message-id. 
+ */ + /* entry is NULL on this path, so the name for the log message is + * taken from the dirent that was just read (de), not from entry. + */ + gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_NO_MEMORY, + "could not create gf_dirent " + "for entry %s: (%s)", + de.d_name, strerror(errno)); + break; + } + entry->d_off = glfs_telldir(glfd); + entry->d_ino = de.d_ino; + entry->d_type = de.d_type; + if (readdirplus) { + iatt_from_stat(buf, &statbuf); + entry->d_stat = *buf; + } + list_add_tail(&entry->list, &entries->list); + + filled_size += this_size; + count++; + } else if (ret == 0 && dirents == NULL) { + *op_errno = ENOENT; + break; + } else if (ret != 0) { + *op_errno = errno; + break; + } + dirents = NULL; + } + +out: + return count; +} + +/* readdirp can be of 2 types. + 1) It can come on entry point directory where the list of snapshots + is sent as dirents. In this case, the iatt structure is filled + on the fly if the inode is not found for the entry or the inode + context is NULL. Otherwise if inode is found and inode context + is there the iatt structure saved in the context is used. + 2) It can be on a directory in one of the snapshots. In this case, + the readdirp call would have sent us a iatt structure. So the same + structure is used with the exception that the gfid and the inode + numbers will be newly generated and filled in. 
+*/ +void +svs_readdirp_fill(xlator_t *this, inode_t *parent, svs_inode_t *parent_ctx, + gf_dirent_t *entry) +{ + inode_t *inode = NULL; + uuid_t random_gfid = { + 0, + }; + struct iatt buf = { + 0, + }; + svs_inode_t *inode_ctx = NULL; + + GF_VALIDATE_OR_GOTO("snapview-server", this, out); + GF_VALIDATE_OR_GOTO(this->name, parent, out); + GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out); + GF_VALIDATE_OR_GOTO(this->name, entry, out); + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + goto out; + + inode = inode_grep(parent->table, parent, entry->d_name); + if (inode) { + entry->inode = inode; + inode_ctx = svs_inode_ctx_get(this, inode); + if (!inode_ctx) { + gf_uuid_copy(buf.ia_gfid, inode->gfid); + svs_iatt_fill(inode->gfid, &buf); + buf.ia_type = inode->ia_type; + } else { + buf = inode_ctx->buf; + } + + entry->d_ino = buf.ia_ino; + + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) + entry->d_stat = buf; + else { + entry->d_stat.ia_ino = buf.ia_ino; + gf_uuid_copy(entry->d_stat.ia_gfid, buf.ia_gfid); + } + } else { + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + inode = inode_new(parent->table); + entry->inode = inode; + + /* If inode context allocation fails, then do not send + * the inode for that particular entry as part of + * readdirp response. Fuse and protocol/server will link + * the inodes in readdirp only if the entry contains + * inode in it. 
+ */ + inode_ctx = svs_inode_ctx_get_or_new(this, inode); + if (!inode_ctx) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY, + "failed to allocate inode " + "context for %s", + entry->d_name); + inode_unref(entry->inode); + entry->inode = NULL; + goto out; + } + + /* Generate virtual gfid for SNAPSHOT dir and + * update the statbuf + */ + gf_uuid_generate(random_gfid); + gf_uuid_copy(buf.ia_gfid, random_gfid); + svs_fill_ino_from_gfid(&buf); + buf.ia_type = IA_IFDIR; + entry->d_ino = buf.ia_ino; + entry->d_stat = buf; + inode_ctx->buf = buf; + inode_ctx->type = SNAP_VIEW_SNAPSHOT_INODE; + } else { + /* For files under snapshot world do not set + * entry->inode and reset statbuf (except ia_ino), + * so that FUSE/Kernel will send an explicit lookup. + * entry->d_stat contains the statbuf information + * of original file, so for NFS not to cache this + * information and to send explicit lookup, it is + * required to reset the statbuf. + * Virtual gfid for these files will be generated in the + * first lookup. + */ + buf.ia_ino = entry->d_ino; + entry->d_stat = buf; + } + } + +out: + return; +} + +/* In readdirp, though new inode is created along with the generation of + new gfid, the inode context created will not contain the glfs_t instance + for the filesystem it belongs to and the handle for it in the gfapi + world. (handle is obtained only by doing the lookup call on the entry + and doing lookup on each entry received as part of readdir call is a + costly operation. So the fs and handle is NULL in the inode context + and is filled in when lookup comes on that object. 
+*/ +int32_t +svs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + struct iatt buf = { + 0, + }; + int count = 0; + int op_ret = -1; + int op_errno = EINVAL; + svs_inode_t *parent_ctx = NULL; + svs_fd_t *svs_fd = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + INIT_LIST_HEAD(&entries.list); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto unwind; + } + + parent_ctx = svs_inode_ctx_get(this, fd->inode); + if (!parent_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "failed to get the inode " + "context for %s", + uuid_utoa(fd->inode->gfid)); + goto unwind; + } + + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + LOCK(&fd->lock); + { + count = svs_fill_readdir(this, &entries, size, off); + } + UNLOCK(&fd->lock); + + op_ret = count; + + list_for_each_entry(entry, &entries.list, list) + { + svs_readdirp_fill(this, fd->inode, parent_ctx, entry); + } + + goto unwind; + } else { + svs_fd = svs_fd_ctx_get_or_new(this, fd); + if (!svs_fd) { + op_ret = -1; + op_errno = EBADFD; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_FD_CONTEXT_FAILED, + "failed to get the fd context " + "for the inode %s", + uuid_utoa(fd->inode->gfid)); + goto unwind; + } + + glfs_seekdir(svs_fd->fd, off); + + LOCK(&fd->lock); + { + count = svs_glfs_readdir(this, svs_fd->fd, &entries, &op_errno, + &buf, _gf_true, size); + } + UNLOCK(&fd->lock); + + op_ret = count; + + list_for_each_entry(entry, &entries.list, list) + { + svs_readdirp_fill(this, fd->inode, parent_ctx, entry); + } + + goto unwind; + } + +unwind: + 
STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, &entries, dict); + + gf_dirent_free(&entries); + + return 0; +} + +int32_t +svs_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + gf_dirent_t entries = { + { + { + 0, + }, + }, + }; + int count = 0; + svs_inode_t *inode_ctx = NULL; + int op_errno = EINVAL; + int op_ret = -1; + svs_fd_t *svs_fd = NULL; + glfs_fd_t *glfd = NULL; + + INIT_LIST_HEAD(&entries.list); + + GF_VALIDATE_OR_GOTO("snap-view-server", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + inode_ctx = svs_inode_ctx_get(this, fd->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found in " + "the inode %s", + uuid_utoa(fd->inode->gfid)); + goto unwind; + } + + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + LOCK(&fd->lock); + { + count = svs_fill_readdir(this, &entries, size, off); + } + UNLOCK(&fd->lock); + } else { + svs_fd = svs_fd_ctx_get_or_new(this, fd); + if (!svs_fd) { + op_ret = -1; + op_errno = EBADFD; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_FD_CONTEXT_FAILED, + "failed to get the fd " + "context for %s", + uuid_utoa(fd->inode->gfid)); + goto unwind; + } + + glfd = svs_fd->fd; + + LOCK(&fd->lock); + { + count = svs_glfs_readdir(this, glfd, &entries, &op_errno, NULL, + _gf_false, size); + } + UNLOCK(&fd->lock); + } + + op_ret = count; + +unwind: + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, xdata); + + gf_dirent_free(&entries); + + return 0; +} + +/* + * This function is mainly helpful for NFS. Till now NFS server was not linking + * the inodes in readdirp, which caused problems when below operations were + * performed. 
+ * + * 1) ls -l in one of the snapshots (snapview-server would generate gfids for + * each entry on the fly and link the inodes associated with those entries) + * 2) NFS server upon getting readdirp reply would not link the inodes of the + * entries. But it used to generate filehandles for each entry and associate + * the gfid of that entry with the filehandle and send it as part of the + * reply to nfs client. + * 3) NFS client would send the filehandle of one of those entries when some + * activity is done on it. + * 4) NFS server would not be able to find the inode for the gfid present in the + * filehandle (as the inode was not linked) and would go for hard resolution + * by sending a lookup on the gfid by creating a new inode. + * 5) snapview-client will not be able to identify whether the inode is a real + * inode existing in the main volume or a virtual inode existing in the + * snapshots as there would not be any inode context. + * 6) Since the gfid upon which lookup is sent is a virtual gfid which is not + * present in the disk, lookup would fail and the application would get an + * error. + * + * The above problem is fixed by the below commit which makes snapview server + * more compatible with nfs server (1dea949cb60c3814c9206df6ba8dddec8d471a94). + * But now the fact that NFS server does inode linking in readdirp has introduced + * the below issue. + * In readdirp though snapview-server allocates inode contexts it does not + * actually perform lookup on each entry it obtained in readdirp (as doing + * a lookup via gfapi over the network for each entry would be costly). + * + * Till now it was not a problem with NFS server, as NFS was sending a lookup on + * the gfid it got from NFS client, for which it was not able to find the right + * inode. 
So snapview-server was able to get the fs instance (glfs_t) of the + * snapshot volume to which the entry belongs to, and the handle for the entry + * from the corresponding snapshot volume and fill those information in the + * inode context. + * + * But now, since NFS server is able to find the inode from the inode table for + * the gfid it got from the NFS client, it won't send lookup. Rather it directly + * sends the fop it received from the client. Now this causes problems for + * snapview-server. Because for each fop snapview-server assumes that lookup has + * been performed on that entry and the entry's inode context contains the + * pointers for the fs instance and the handle to the entry in that fs. When NFS + * server sends the fop and snapview-server finds that the fs instance and the + * handle within the inode context are NULL it unwinds with EINVAL. + * + * So to handle this, if fs instance or handle within the inode context are + * NULL, then do a lookup based on parent inode context's fs instance. 
And + * unwind the results obtained as part of lookup + */ + +int32_t +svs_get_handle(xlator_t *this, loc_t *loc, svs_inode_t *inode_ctx, + int32_t *op_errno) +{ + svs_inode_t *parent_ctx = NULL; + int ret = -1; + inode_t *parent = NULL; + struct iatt postparent = { + 0, + }; + struct iatt buf = { + 0, + }; + char uuid1[64]; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + if (loc->path) { + if (!loc->name || (loc->name && !strcmp(loc->name, ""))) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + } + } + + if (loc->parent) + parent = inode_ref(loc->parent); + else { + parent = inode_find(loc->inode->table, loc->pargfid); + if (!parent) + parent = inode_parent(loc->inode, NULL, NULL); + } + + if (parent) + parent_ctx = svs_inode_ctx_get(this, parent); + + if (!parent_ctx) { + *op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, *op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "failed to get the parent " + "context for %s (%s)", + loc->path, uuid_utoa_r(loc->inode->gfid, uuid1)); + goto out; + } + + if (parent_ctx) { + if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) + ret = svs_lookup_snapshot(this, loc, &buf, &postparent, parent, + parent_ctx, op_errno); + else + ret = svs_lookup_entry(this, loc, &buf, &postparent, parent, + parent_ctx, op_errno); + } + +out: + if (parent) + inode_unref(parent); + + return ret; +} + +int32_t +svs_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + struct iatt buf = { + 0, + }; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + svs_inode_t *inode_ctx = NULL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + struct stat stat = { + 0, + }; + int ret = -1; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, 
loc->inode, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + /* Instead of doing the check of whether it is a entry point directory + or not by checking the name of the entry and then deciding what + to do, just check the inode context and decide what to be done. + */ + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for %s", uuid_utoa(loc->inode->gfid)); + goto out; + } + + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + svs_iatt_fill(loc->inode->gfid, &buf); + op_ret = 0; + } else { + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, + op_errno, out); + + ret = glfs_h_stat(fs, object, &stat); + if (ret) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED, + "glfs_h_stat on %s (gfid: %s) " + "failed", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } else + gf_msg_debug(this->name, 0, "stat on %s (%s) successful", loc->path, + uuid_utoa(loc->inode->gfid)); + + iatt_from_stat(&buf, &stat); + gf_uuid_copy(buf.ia_gfid, loc->inode->gfid); + svs_fill_ino_from_gfid(&buf); + op_ret = ret; + } + +out: + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, &buf, xdata); + return 0; +} + +int32_t +svs_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + struct iatt buf = { + 0, + }; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + svs_inode_t *inode_ctx = NULL; + struct stat stat = { + 0, + }; + int ret = -1; + glfs_fd_t *glfd = NULL; + svs_fd_t *sfd = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + + /* Instead of doing 
the check of whether it is a entry point directory + or not by checking the name of the entry and then deciding what + to do, just check the inode context and decide what to be done. + */ + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, fd->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for" + " the inode %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + svs_iatt_fill(fd->inode->gfid, &buf); + op_ret = 0; + } else { + if (!(svs_inode_ctx_glfs_mapping(this, inode_ctx))) { + op_ret = -1; + op_errno = EBADF; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_FS_INSTANCE_INVALID, + "glfs instance %p to which the inode %s " + "belongs to does not exist. That snapshot " + "corresponding to the fs instance " + "might have been deleted or deactivated.", + inode_ctx->fs, uuid_utoa(fd->inode->gfid)); + goto out; + } + + sfd = svs_fd_ctx_get_or_new(this, fd); + if (!sfd) { + op_ret = -1; + op_errno = EBADFD; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_FD_CONTEXT_FAILED, + "failed to get the fd context " + "for %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + glfd = sfd->fd; + ret = glfs_fstat(glfd, &stat); + if (ret) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED, + "glfs_fstat on gfid: %s failed", uuid_utoa(fd->inode->gfid)); + goto out; + } + + iatt_from_stat(&buf, &stat); + gf_uuid_copy(buf.ia_gfid, fd->inode->gfid); + svs_fill_ino_from_gfid(&buf); + op_ret = ret; + } + +out: + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, &buf, xdata); + return 0; +} + +int32_t +svs_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + struct statvfs buf = { + 0, + 
}; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + svs_inode_t *inode_ctx = NULL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + int ret = -1; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + /* Instead of doing the check of whether it is a entry point directory + or not by checking the name of the entry and then deciding what + to do, just check the inode context and decide what to be done. + */ + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for %s", uuid_utoa(loc->inode->gfid)); + goto out; + } + + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno, + out); + + ret = glfs_h_statfs(fs, object, &buf); + if (ret) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STATFS_FAILED, + "glfs_h_statvfs on %s (gfid: %s) " + "failed", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } + op_ret = ret; + +out: + STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, &buf, xdata); + return 0; +} + +int32_t +svs_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + svs_inode_t *inode_ctx = NULL; + svs_fd_t *sfd = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + glfs_fd_t *glfd = NULL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + 
GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context for %s (gfid: %s) " + "not found", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } + + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) + GF_ASSERT(0); // on entry point it should always be opendir + + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno, + out); + + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + glfd = glfs_h_open(fs, object, flags); + if (!glfd) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_OPEN_FAILED, + "glfs_h_open on %s failed (gfid: %s)", loc->name, + uuid_utoa(loc->inode->gfid)); + goto out; + } + + sfd = svs_fd_ctx_get_or_new(this, fd); + if (!sfd) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to allocate fd context " + "for %s (gfid: %s)", + loc->name, uuid_utoa(loc->inode->gfid)); + glfs_close(glfd); + goto out; + } + sfd->fd = glfd; + + op_ret = 0; + +out: + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, NULL); + return 0; +} + +int32_t +svs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + svs_private_t *priv = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec vec = { + 0, + }; + svs_fd_t *sfd = NULL; + int ret = -1; + struct glfs_stat fstatbuf = { + 0, + }; + glfs_fd_t *glfd = NULL; + struct iatt stbuf = { + 0, + }; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, 
fd->inode, out); + + priv = this->private; + VALIDATE_OR_GOTO(priv, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + if (!svs_inode_glfs_mapping(this, fd->inode)) { + op_ret = -1; + op_errno = EBADF; /* should this be some other error? */ + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_FS_INSTANCE_INVALID, + "glfs instance to which the inode " + "%s receiving read request belongs, " + "does not exist anymore", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + sfd = svs_fd_ctx_get_or_new(this, fd); + if (!sfd) { + op_ret = -1; + op_errno = EBADFD; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "failed to get the fd " + "context for %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + glfd = sfd->fd; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, size); + if (!iobuf) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY, + "failed to " + "allocate iobuf while reading the " + "file with gfid %s", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + ret = glfs_pread(glfd, iobuf->ptr, size, offset, 0, &fstatbuf); + if (ret < 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_READ_FAILED, + "glfs_read failed on %s (%s)", uuid_utoa(fd->inode->gfid), + strerror(op_errno)); + goto out; + } + + vec.iov_base = iobuf->ptr; + vec.iov_len = ret; + + iobref = iobref_new(); + + iobref_add(iobref, iobuf); + glfs_iatt_from_statx(&stbuf, &fstatbuf); + gf_uuid_copy(stbuf.ia_gfid, fd->inode->gfid); + svs_fill_ino_from_gfid(&stbuf); + + /* Hack to notify higher layers of EOF. 
*/ + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + +out: + + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &vec, 1, &stbuf, iobref, + NULL); + + if (iobref) + iobref_unref(iobref); + if (iobuf) + iobuf_unref(iobuf); + + return 0; +} + +int32_t +svs_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + svs_inode_t *inode_ctx = NULL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + int op_ret = -1; + int op_errno = EINVAL; + char *buf = NULL; + struct iatt stbuf = { + 0, + }; + int ret = -1; + struct stat stat = { + 0, + }; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "failed to get inode context " + "for %s (gfid: %s)", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } + + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno, + out); + + ret = glfs_h_stat(fs, object, &stat); + if (ret) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED, + "glfs_h_stat on %s (gfid: %s) " + "failed", + loc->name, uuid_utoa(loc->inode->gfid)); + goto out; + } + + iatt_from_stat(&stbuf, &stat); + gf_uuid_copy(stbuf.ia_gfid, loc->inode->gfid); + svs_fill_ino_from_gfid(&stbuf); + + buf = alloca(size + 1); + op_ret = glfs_h_readlink(fs, object, buf, size); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_READLINK_FAILED, + "readlink on %s 
failed (gfid: %s)", loc->name, + uuid_utoa(loc->inode->gfid)); + goto out; + } + + buf[op_ret] = 0; + +out: + STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, buf, &stbuf, NULL); + + return 0; +} + +int32_t +svs_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask, + dict_t *xdata) +{ + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + glfs_t *fs = NULL; + glfs_object_t *object = NULL; + svs_inode_t *inode_ctx = NULL; + gf_boolean_t is_fuse_call = 0; + int mode = 0; + call_stack_t *root = NULL; + + GF_VALIDATE_OR_GOTO("svs", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, out); + + root = frame->root; + op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps, + root->groups); + if (op_ret != 0) { + goto out; + } + + inode_ctx = svs_inode_ctx_get(this, loc->inode); + if (!inode_ctx) { + op_ret = -1; + op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SVS_MSG_GET_INODE_CONTEXT_FAILED, + "inode context not found for %s", uuid_utoa(loc->inode->gfid)); + goto out; + } + + is_fuse_call = __is_fuse_call(frame); + + /* + * For entry-point directory, set read and execute bits. But not write + * permissions. + */ + if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) { + if (is_fuse_call) { + op_ret = 0; + op_errno = 0; + } else { + op_ret = 0; + mode |= POSIX_ACL_READ; + mode |= POSIX_ACL_EXECUTE; + op_errno = mode; + } + goto out; + } + + SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno, + out); + + /* The actual posix_acl xlator does acl checks differently for + fuse and nfs. 
So set frame->root->pid as fspid of the syncop + if the call came from nfs + */ + if (!is_fuse_call) { + syncopctx_setfspid(&frame->root->pid); + syncopctx_setfsuid(&frame->root->uid); + syncopctx_setfsgid(&frame->root->gid); + syncopctx_setfsgroups(frame->root->ngrps, frame->root->groups); + } + + ret = glfs_h_access(fs, object, mask); + if (ret < 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_ACCESS_FAILED, + "failed to access %s (gfid: %s)", loc->path, + uuid_utoa(loc->inode->gfid)); + goto out; + } + + op_ret = 0; + op_errno = ret; + +out: + + STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +notify(xlator_t *this, int32_t event, void *data, ...) +{ + switch (event) { + case GF_EVENT_PARENT_UP: { + /* Tell the parent that snapview-server xlator is up */ + default_notify(this, GF_EVENT_CHILD_UP, data); + } break; + default: + break; + } + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_svs_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_MEM_ACNT_FAILED, + "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init(xlator_t *this) +{ + svs_private_t *priv = NULL; + int ret = -1; + + /* This can be the top of graph in certain cases */ + if (!this->parents) { + gf_msg_debug(this->name, 0, "dangling volume. check volfile "); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_svs_mt_priv_t); + if (!priv) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY, + "failed to " + "allocate memory for this->private "); + goto out; + } + + this->private = priv; + + GF_OPTION_INIT("volname", priv->volname, str, out); + LOCK_INIT(&priv->snaplist_lock); + + LOCK(&priv->snaplist_lock); + { + priv->num_snaps = 0; + } + UNLOCK(&priv->snaplist_lock); + + /* What to do here upon failure? should init be failed or succeed? 
*/ + /* If succeeded, then dynamic management of snapshots will not */ + /* happen.*/ + ret = svs_mgmt_init(this); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, SVS_MSG_MGMT_INIT_FAILED, + "failed to initiate the " + "mgmt rpc callback for svs. Dymamic management of the" + "snapshots will not happen"); + goto out; + } + + /* get the list of snaps first to return to client xlator */ + ret = svs_get_snapshot_list(this); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + SVS_MSG_GET_SNAPSHOT_LIST_FAILED, + "Error initializing snaplist infrastructure"); + ret = -1; + goto out; + } + + ret = 0; + +out: + if (ret && priv) { + LOCK_DESTROY(&priv->snaplist_lock); + GF_FREE(priv->dirents); + GF_FREE(priv); + } + + return ret; +} + +void +fini(xlator_t *this) +{ + svs_private_t *priv = NULL; + glusterfs_ctx_t *ctx = NULL; + int ret = 0; + + GF_ASSERT(this); + priv = this->private; + this->private = NULL; + ctx = this->ctx; + if (!ctx) + gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_INVALID_GLFS_CTX, + "Invalid ctx found"); + + if (priv) { + ret = LOCK_DESTROY(&priv->snaplist_lock); + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, + SVS_MSG_LOCK_DESTROY_FAILED, + "Could not destroy mutex snaplist_lock"); + } + + if (priv->dirents) { + GF_FREE(priv->dirents); + } + + if (priv->rpc) { + /* cleanup the saved-frames before last unref */ + rpc_clnt_connection_cleanup(&priv->rpc->conn); + rpc_clnt_unref(priv->rpc); + } + + GF_FREE(priv); + } + + return; +} + +struct xlator_fops fops = { + .lookup = svs_lookup, + .stat = svs_stat, + .statfs = svs_statfs, + .opendir = svs_opendir, + .readdirp = svs_readdirp, + .readdir = svs_readdir, + .open = svs_open, + .readv = svs_readv, + .flush = svs_flush, + .fstat = svs_fstat, + .getxattr = svs_getxattr, + .access = svs_access, + .readlink = svs_readlink, + /* entry fops */ +}; + +struct xlator_cbks cbks = { + .release = svs_release, + .releasedir = svs_releasedir, + .forget = svs_forget, +}; + +struct 
volume_options options[] = { + { + .key = {"volname"}, + .type = GF_OPTION_TYPE_STR, + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .mem_acct_init = mem_acct_init, + .op_version = {1}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "snapview-server", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/snapview-server/src/snapview-server.h b/xlators/features/snapview-server/src/snapview-server.h new file mode 100644 index 00000000000..6472422e715 --- /dev/null +++ b/xlators/features/snapview-server/src/snapview-server.h @@ -0,0 +1,255 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/*
 * Load the glfs instance and object handle cached in inode_ctx into the
 * caller's 'fs' and 'object' variables.
 *
 * The cached fs is first checked with SVS_CHECK_VALID_SNAPSHOT_HANDLE(), which
 * resets it to NULL when it is not one of the currently listed snapshots.  If
 * either fs or object is then missing, svs_get_handle() is called to resolve
 * them again and the values are re-read from inode_ctx.  When that call
 * fails, an error is logged, 'ret' is set to -1 and control jumps to 'label'
 * (svs_get_handle() receives &op_errno).
 *
 * The expansion intentionally has NO trailing semicolon after "while (0)":
 * the caller's own ';' terminates the statement, so the macro stays a single
 * statement and is safe inside an unbraced if/else.
 */
#define SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, ret,          \
                               op_errno, label)                                \
    do {                                                                       \
        fs = inode_ctx->fs;                                                    \
        object = inode_ctx->object;                                            \
        SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);                             \
        if (!fs)                                                               \
            object = NULL;                                                     \
                                                                               \
        if (!fs || !object) {                                                  \
            int32_t tmp = -1;                                                  \
            char tmp_uuid[64];                                                 \
                                                                               \
            tmp = svs_get_handle(this, loc, inode_ctx, &op_errno);             \
            if (tmp) {                                                         \
                gf_log(this->name, GF_LOG_ERROR,                               \
                       "failed to get the handle for %s "                      \
                       "(gfid: %s)",                                           \
                       loc->path, uuid_utoa_r(loc->inode->gfid, tmp_uuid));    \
                ret = -1;                                                      \
                goto label;                                                    \
            }                                                                  \
                                                                               \
            fs = inode_ctx->fs;                                                \
            object = inode_ctx->object;                                        \
        }                                                                      \
    } while (0)
/* Private state of the snapview-server xlator (read through this->private). */
struct svs_private {
    /* Array of snapshot entries; SVS_CHECK_VALID_SNAPSHOT_HANDLE walks
     * dirents[0 .. num_snaps-1] and compares each entry's fs pointer. */
    snap_dirent_t *dirents;
    /* Number of valid entries in dirents. */
    int num_snaps;
    /* NOTE(review): presumably the value of the "volname" option - confirm. */
    char *volname;
    struct list_head snaplist;
    /* Held while dirents/num_snaps are being read (see
     * SVS_CHECK_VALID_SNAPSHOT_HANDLE). */
    gf_lock_t snaplist_lock;
    /* NOTE(review): presumably the management RPC client set up by
     * svs_mgmt_init() - confirm. */
    struct rpc_clnt *rpc;
};
typedef struct svs_private svs_private_t;
+__svs_get_snap_dirent(xlator_t *this, const char *name); + +int +svs_mgmt_init(xlator_t *this); + +int32_t +svs_get_handle(xlator_t *this, loc_t *loc, svs_inode_t *inode_ctx, + int32_t *op_errno); + +glfs_t * +svs_inode_glfs_mapping(xlator_t *this, inode_t *inode); + +glfs_t * +svs_inode_ctx_glfs_mapping(xlator_t *this, svs_inode_t *inode_ctx); + +#endif /* __SNAP_VIEW_H__ */ diff --git a/xlators/features/thin-arbiter/Makefile.am b/xlators/features/thin-arbiter/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/thin-arbiter/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/thin-arbiter/src/Makefile.am b/xlators/features/thin-arbiter/src/Makefile.am new file mode 100644 index 00000000000..a3c133e7798 --- /dev/null +++ b/xlators/features/thin-arbiter/src/Makefile.am @@ -0,0 +1,22 @@ +xlator_LTLIBRARIES = thin-arbiter.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +thin_arbiter_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +thin_arbiter_la_SOURCES = thin-arbiter.c \ + $(top_builddir)/xlators/lib/src/libxlator.c + +thin_arbiter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = thin-arbiter.h thin-arbiter-mem-types.h thin-arbiter-messages.h \ + $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h b/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h new file mode 100644 index 00000000000..69562d2febc --- /dev/null +++ b/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h @@ -0,0 +1,19 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __THIN_ARBITER_MEM_TYPES_H__ +#define __THIN_ARBITER_MEM_TYPES_H__ +#include <glusterfs/mem-types.h> + +typedef enum gf_ta_mem_types_ { + gf_ta_mt_local_t = gf_common_mt_end + 1, + gf_ta_mt_char, + gf_ta_mt_end +} gf_ta_mem_types_t; +#endif diff --git a/xlators/features/thin-arbiter/src/thin-arbiter-messages.h b/xlators/features/thin-arbiter/src/thin-arbiter-messages.h new file mode 100644 index 00000000000..81d7491577a --- /dev/null +++ b/xlators/features/thin-arbiter/src/thin-arbiter-messages.h @@ -0,0 +1,28 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _TA_MESSAGES_H_ +#define _TA_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(TA, TA_MSG_INVALID_FOP); + +#endif /* !_TA_MESSAGES_H_ */ diff --git a/xlators/features/thin-arbiter/src/thin-arbiter.c b/xlators/features/thin-arbiter/src/thin-arbiter.c new file mode 100644 index 00000000000..ce3008636f1 --- /dev/null +++ b/xlators/features/thin-arbiter/src/thin-arbiter.c @@ -0,0 +1,661 @@ +/* + Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "thin-arbiter.h" +#include "thin-arbiter-messages.h" +#include "thin-arbiter-mem-types.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> + +int +ta_set_incoming_values(dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t ret = 0; + ta_fop_t *fop = (ta_fop_t *)data; + int32_t *pending = NULL; + + pending = GF_CALLOC(1, value->len, gf_ta_mt_char); + if (!pending) { + ret = -ENOMEM; + goto out; + } + ret = dict_set_bin(fop->brick_xattr, key, pending, value->len); +out: + return ret; +} + +int +ta_get_incoming_and_brick_values(dict_t *dict, char *key, data_t *value, + void *data) +{ + ta_fop_t *fop = data; + char *source = NULL; + char *in_coming = NULL; + int32_t len = 0, ret = 0; + + source = GF_CALLOC(1, value->len, gf_ta_mt_char); + if (!source) { + ret = -ENOMEM; + goto out; + } + + ret = dict_get_ptr_and_len(fop->dict, key, (void **)&in_coming, &len); + + if (!in_coming || value->len != len) { + ret = -EINVAL; + goto out; + } + + if (!memcmp(value->data, source, value->len) && + (!memcmp(in_coming, source, len))) { + fop->on_disk[fop->idx] = 0; + } else { + fop->on_disk[fop->idx] = 1; + } + + fop->idx++; +out: + GF_FREE(source); + return ret; +} + +void +ta_release_fop(ta_fop_t *fop) +{ + if (!fop) { + return; + } + if (fop->fd) { + fd_unref(fop->fd); + } + loc_wipe(&fop->loc); + if (fop->dict) { + dict_unref(fop->dict); + } + if (fop->brick_xattr) { + dict_unref(fop->brick_xattr); + } + + GF_FREE(fop); + return; +} + +int32_t +ta_set_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + 
int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + TA_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +/* +case 1 - If brick value is 0 and incoming value is also 0, fine +case 2 - If brick value is 0 and incoming value is non 0, fine +case 3 - If brick value is non 0 and incoming value is also 0, fine +case 4 - If brick value is non 0 and incoming value is non 0, fine +case 5 - If incoming value is non zero on both brick, it is wrong +case 6 - If incoming value is non zero but brick value for other +brick is also non zero, wrong +*/ + +int32_t +ta_verify_on_disk_source(ta_fop_t *fop, dict_t *dict) +{ + int ret = 0; + + if (!fop) { + return -EINVAL; + } + + ret = dict_foreach(dict, ta_get_incoming_and_brick_values, (void *)fop); + if (ret < 0) { + return ret; + } + if (fop->on_disk[0] && fop->on_disk[1]) { + return -EINVAL; + } + return 0; +} + +int32_t +ta_get_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + ta_fop_t *fop = NULL; + int ret = 0; + + fop = frame->local; + if (op_ret) { + goto unwind; + } + + ret = ta_verify_on_disk_source(fop, dict); + if (ret < 0) { + op_errno = -ret; + goto unwind; + } + + if (fop->fd) { + STACK_WIND(frame, ta_set_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fop->fd, + fop->xattrop_flags, fop->dict, NULL); + } else { + STACK_WIND(frame, ta_set_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, &fop->loc, + fop->xattrop_flags, fop->dict, NULL); + } + return 0; + +unwind: + + TA_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + return -1; +} + +ta_fop_t * +ta_prepare_fop(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + ta_fop_t *fop = NULL; + int ret = 0; + + fop = GF_CALLOC(1, sizeof(*fop), gf_ta_mt_local_t); + if (!fop) { + goto out; + } + + if (loc) { + loc_copy(&fop->loc, loc); 
+ } + + if (fd) { + fop->fd = fd_ref(fd); + } + + fop->xattrop_flags = flags; + fop->idx = 0; + + if (dict != NULL) { + fop->dict = dict_ref(dict); + } + fop->brick_xattr = dict_new(); + if (fop->brick_xattr == NULL) { + goto out; + } + ret = dict_foreach(dict, ta_set_incoming_values, (void *)fop); + if (ret < 0) { + goto out; + } + frame->local = fop; + return fop; + +out: + ta_release_fop(fop); + return NULL; +} + +int32_t +ta_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + int ret = 0; + ta_fop_t *fop = NULL; + + fop = ta_prepare_fop(frame, this, NULL, fd, flags, dict, xdata); + if (!fop) { + ret = -ENOMEM; + goto unwind; + } + + STACK_WIND(frame, ta_get_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, flags, fop->brick_xattr, + xdata); + return 0; + +unwind: + + TA_STACK_UNWIND(xattrop, frame, -1, -ret, NULL, NULL); + return 0; +} + +int32_t +ta_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + int ret = 0; + ta_fop_t *fop = NULL; + + fop = ta_prepare_fop(frame, this, loc, NULL, flags, dict, xdata); + if (!fop) { + ret = -ENOMEM; + goto unwind; + } + + STACK_WIND(frame, ta_get_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, flags, fop->brick_xattr, + xdata); + return 0; + +unwind: + + TA_STACK_UNWIND(xattrop, frame, -1, -ret, NULL, NULL); + return 0; +} + +int32_t +ta_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + TA_FAILED_FOP(writev, frame, EINVAL); + return 0; +} + +int32_t +ta_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + TA_FAILED_FOP(fsetxattr, frame, EINVAL); + return 0; +} + +int32_t +ta_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ 
+ TA_FAILED_FOP(setxattr, frame, EINVAL); + return 0; +} + +int32_t +ta_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + TA_FAILED_FOP(fallocate, frame, EINVAL); + return 0; +} + +int32_t +ta_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + TA_FAILED_FOP(access, frame, EINVAL); + return 0; +} + +int32_t +ta_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + TA_FAILED_FOP(discard, frame, EINVAL); + return 0; +} + +int32_t +ta_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + TA_FAILED_FOP(entrylk, frame, EINVAL); + return 0; +} + +int32_t +ta_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + TA_FAILED_FOP(fentrylk, frame, EINVAL); + return 0; +} + +int32_t +ta_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + TA_FAILED_FOP(flush, frame, EINVAL); + return 0; +} + +int32_t +ta_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + TA_FAILED_FOP(fsync, frame, EINVAL); + return 0; +} +int32_t +ta_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + TA_FAILED_FOP(fsyncdir, frame, EINVAL); + return 0; +} + +int32_t +ta_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) +{ + TA_FAILED_FOP(getxattr, frame, EINVAL); + return 0; +} + +int32_t +ta_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + TA_FAILED_FOP(fgetxattr, frame, EINVAL); + return 0; +} + +int32_t +ta_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + TA_FAILED_FOP(link, frame, EINVAL); + return 0; +} 
+ +int32_t +ta_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + TA_FAILED_FOP(lk, frame, EINVAL); + return 0; +} + +int32_t +ta_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + TA_FAILED_FOP(mkdir, frame, EINVAL); + return 0; +} + +int32_t +ta_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + TA_FAILED_FOP(mknod, frame, EINVAL); + return 0; +} + +int32_t +ta_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + TA_FAILED_FOP(open, frame, EINVAL); + return 0; +} + +int32_t +ta_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + TA_FAILED_FOP(opendir, frame, EINVAL); + return 0; +} + +int32_t +ta_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + TA_FAILED_FOP(readdir, frame, EINVAL); + return 0; +} + +int32_t +ta_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + TA_FAILED_FOP(readdirp, frame, EINVAL); + return 0; +} + +int32_t +ta_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + TA_FAILED_FOP(readlink, frame, EINVAL); + return 0; +} + +int32_t +ta_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + TA_FAILED_FOP(readv, frame, EINVAL); + return 0; +} + +int32_t +ta_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + TA_FAILED_FOP(removexattr, frame, EINVAL); + return 0; +} + +int32_t +ta_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + TA_FAILED_FOP(fremovexattr, frame, EINVAL); + return 0; +} + +int32_t +ta_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t 
*xdata) +{ + TA_FAILED_FOP(rename, frame, EINVAL); + return 0; +} + +int32_t +ta_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + TA_FAILED_FOP(rmdir, frame, EINVAL); + return 0; +} + +int32_t +ta_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + TA_FAILED_FOP(setattr, frame, EINVAL); + return 0; +} + +int32_t +ta_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + TA_FAILED_FOP(fsetattr, frame, EINVAL); + return 0; +} + +int32_t +ta_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + TA_FAILED_FOP(stat, frame, EINVAL); + return 0; +} + +int32_t +ta_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + TA_FAILED_FOP(fstat, frame, EINVAL); + return 0; +} + +int32_t +ta_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + TA_FAILED_FOP(statfs, frame, EINVAL); + return 0; +} + +int32_t +ta_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + TA_FAILED_FOP(symlink, frame, EINVAL); + return 0; +} + +int32_t +ta_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + TA_FAILED_FOP(truncate, frame, EINVAL); + return 0; +} + +int32_t +ta_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + TA_FAILED_FOP(ftruncate, frame, EINVAL); + return 0; +} + +int32_t +ta_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + TA_FAILED_FOP(unlink, frame, EINVAL); + return 0; +} + +int32_t +ta_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + TA_FAILED_FOP(zerofill, frame, EINVAL); + return 0; +} + +int32_t +ta_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + TA_FAILED_FOP(seek, 
frame, EINVAL); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_ta_mt_end + 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting " + "initialization failed."); + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + return 0; +} + +int32_t +init(xlator_t *this) +{ + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "'thin_arbiter' not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_ERROR, "dangling volume. check volfile "); + } + return 0; +} + +void +fini(xlator_t *this) +{ + return; +} + +struct xlator_fops fops = { + /*Passed fop*/ + .xattrop = ta_xattrop, + .fxattrop = ta_fxattrop, + /*Failed fop*/ + .writev = ta_writev, + .stat = ta_stat, + .fstat = ta_fstat, + .truncate = ta_truncate, + .ftruncate = ta_ftruncate, + .access = ta_access, + .readlink = ta_readlink, + .mknod = ta_mknod, + .mkdir = ta_mkdir, + .unlink = ta_unlink, + .rmdir = ta_rmdir, + .symlink = ta_symlink, + .rename = ta_rename, + .link = ta_link, + .open = ta_open, + .readv = ta_readv, + .flush = ta_flush, + .fsync = ta_fsync, + .opendir = ta_opendir, + .readdir = ta_readdir, + .readdirp = ta_readdirp, + .fsyncdir = ta_fsyncdir, + .statfs = ta_statfs, + .setxattr = ta_setxattr, + .getxattr = ta_getxattr, + .fsetxattr = ta_fsetxattr, + .fgetxattr = ta_fgetxattr, + .removexattr = ta_removexattr, + .fremovexattr = ta_fremovexattr, + .lk = ta_lk, + .entrylk = ta_entrylk, + .fentrylk = ta_fentrylk, + .setattr = ta_setattr, + .fsetattr = ta_fsetattr, + .fallocate = ta_fallocate, + .discard = ta_discard, + .zerofill = ta_zerofill, + .seek = ta_seek, +}; + +struct xlator_cbks cbks = {}; + +struct volume_options options[] = { + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = 
{GD_OP_VERSION_6_0}, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "thin-arbiter", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/thin-arbiter/src/thin-arbiter.h b/xlators/features/thin-arbiter/src/thin-arbiter.h new file mode 100644 index 00000000000..e5f914b84bf --- /dev/null +++ b/xlators/features/thin-arbiter/src/thin-arbiter.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _THIN_ARBITER_H +#define _THIN_ARBITER_H + +#include <glusterfs/locking.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/list.h> + +#define THIN_ARBITER_SOURCE_XATTR "trusted.ta.source" +#define THIN_ARBITER_SOURCE_SIZE 2 + +#define TA_FAILED_FOP(fop, frame, op_errno) \ + do { \ + default_##fop##_failure_cbk(frame, op_errno); \ + } while (0) + +#define TA_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) 
struct _ta_fop;
typedef struct _ta_fop ta_fop_t;

/*
 * Per-call state of an xattrop/fxattrop; allocated by ta_prepare_fop(), kept
 * in frame->local and released by ta_release_fop().
 */
struct _ta_fop {
    /* flags of the original request, reused for the second xattrop */
    gf_xattrop_flags_t xattrop_flags;
    /* copy of the caller's loc (filled with loc_copy, wiped on release) */
    loc_t loc;
    /* referenced fd for fxattrop; NULL for the loc based xattrop */
    fd_t *fd;
    /* reference on the caller's xattrop dict */
    dict_t *dict;
    /* same keys as dict, each holding a zero-filled buffer of equal length;
     * this is what the first xattrop sends to the child */
    dict_t *brick_xattr;
    /* one flag per key visited by ta_get_incoming_and_brick_values():
     * 0 when both the brick value and the incoming value are all zeroes,
     * 1 otherwise */
    int32_t on_disk[2];
    /* next on_disk[] slot to fill */
    int32_t idx;
};
- gf_trash_mt_end + gf_trash_mt_trash_private_t = gf_common_mt_end + 1, + gf_trash_mt_char, + gf_trash_mt_uuid, + gf_trash_mt_trash_elim_path, + gf_trash_mt_end }; #endif - diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c index addeb66a053..7d09cba3e9c 100644 --- a/xlators/features/trash/src/trash.c +++ b/xlators/features/trash/src/trash.c @@ -7,1531 +7,2647 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "trash.h" #include "trash-mem-types.h" +#include <glusterfs/syscall.h> -int32_t -trash_ftruncate_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobuf); +#define root_gfid \ + (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } +#define trash_gfid \ + (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 } +#define internal_op_gfid \ + (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 } int32_t -trash_truncate_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf); +trash_truncate_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); int32_t -trash_truncate_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent); +trash_truncate_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); int32_t -trash_unlink_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - 
int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent); +trash_unlink_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); +/* Common routines used in this translator */ -void -trash_local_wipe (trash_local_t *local) +/** + * When a directory/file is created under trash directory, it should have + * the same permission as before. This function will fetch permission from + * the existing directory and returns the same + */ +mode_t +get_permission(char *path) { - if (!local) - goto out; - - loc_wipe (&local->loc); - loc_wipe (&local->newloc); - - if (local->fd) - fd_unref (local->fd); - - if (local->newfd) - fd_unref (local->newfd); + mode_t mode = 0755; + struct stat sbuf = { + 0, + }; + struct iatt ibuf = { + 0, + }; + int ret = 0; + + ret = sys_stat(path, &sbuf); + if (!ret) { + iatt_from_stat(&ibuf, &sbuf); + mode = st_mode_from_ia(ibuf.ia_prot, ibuf.ia_type); + } else + gf_log("trash", GF_LOG_DEBUG, + "stat on %s failed" + " using default", + path); + return mode; +} - mem_put (local); +/** + * For normalization, trash directory name is stored inside priv structure as + * '/trash_directory/'. As a result the trailing and leading slashes are being + * striped out for additional usage. 
+ */ +int +extract_trash_directory(char *priv_value, const char **trash_directory) +{ + char *tmp = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("trash", priv_value, out); + + tmp = gf_strdup(priv_value + 1); + if (!tmp) { + ret = ENOMEM; + goto out; + } + if (tmp[strlen(tmp) - 1] == '/') + tmp[strlen(tmp) - 1] = '\0'; + *trash_directory = gf_strdup(tmp); + if (!(*trash_directory)) { + ret = ENOMEM; + goto out; + } out: - return; + if (tmp) + GF_FREE(tmp); + return ret; } -int32_t -trash_common_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) +/** + * The trash directory path should be append at beginning of file path for + * delete or truncate operations. Normal trashing moves the contents to + * trash directory and trashing done by internal operations are moved to + * internal_op directory inside trash. + */ +void +copy_trash_path(const char *priv_value, gf_boolean_t internal, char *path, + size_t path_size) { - TRASH_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent, postparent); - return 0; + char trash_path[PATH_MAX] = { + 0, + }; + + strncpy(trash_path, priv_value, sizeof(trash_path)); + trash_path[sizeof(trash_path) - 1] = 0; + if (internal) + strncat(trash_path, "internal_op/", + sizeof(trash_path) - strlen(trash_path) - 1); + + strncpy(path, trash_path, path_size); + path[path_size - 1] = 0; } -int32_t -trash_unlink_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +/** + * This function performs the reverse operation of copy_trash_path(). It gives + * out a pointer, whose starting value will be the path inside trash directory, + * similar to original path. 
+ */ +void +remove_trash_path(const char *path, gf_boolean_t internal, char **rem_path) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *tmp_path = NULL; - char *tmp_dirname = NULL; - char *dir_name = NULL; - int32_t count = 0; - int32_t loop_count = 0; - int i = 0; - loc_t tmp_loc = {0,}; - - local = frame->local; - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto out; - } - loop_count = local->loop_count; - - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - count = 1; - i++; - if (i > loop_count) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto out; - } - - tmp_loc.path = tmp_path; - - /* TODO:create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_unlink_mkdir_cbk, tmp_path, - this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - - goto out; - } - - if (op_ret == 0) { - dir_name = dirname (tmp_str); - if (strcmp((char*)cookie, dir_name) == 0) { - tmp_loc.path = local->newpath; - STACK_WIND (frame, trash_unlink_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->loc, &tmp_loc); - goto out; - } - } - - LOCK (&frame->lock); - { - loop_count = ++local->loop_count; - } - UNLOCK (&frame->lock); - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - count = 1; - i++; - if ((i > loop_count) || (count > PATH_MAX)) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto out; - } - tmp_loc.path = tmp_path; - - STACK_WIND_COOKIE (frame, 
trash_unlink_mkdir_cbk, tmp_path, - this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - -out: - GF_FREE (cookie); - GF_FREE (tmp_str); + if (rem_path == NULL) { + return; + } - return 0; + *rem_path = strchr(path + 1, '/'); + if (internal) + *rem_path = strchr(*rem_path + 1, '/'); } -int32_t -trash_rename_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent); - -int32_t -trash_unlink_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) +/** + * Checks whether the given path reside under the specified eliminate path + */ +int +check_whether_eliminate_path(trash_elim_path *trav, const char *path) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *dir_name = NULL; - char *tmp_cookie = NULL; - loc_t tmp_loc = {0,}; - - local = frame->local; - - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - dir_name = dirname (tmp_str); - - tmp_loc.path = dir_name; - - tmp_cookie = gf_strdup (dir_name); - if (!tmp_cookie) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - /* TODO: create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_unlink_mkdir_cbk, tmp_cookie, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - &tmp_loc, 0755, NULL); - - GF_FREE (tmp_str); - - return 0; - } + int match = 0; - if ((op_ret == -1) && (op_errno == ENOTDIR)) { - - gf_log (this->name, GF_LOG_DEBUG, - "target(%s) exists, cannot keep the copy, deleting", - local->newpath); - - STACK_WIND (frame, trash_common_unwind_cbk, - this->children->xlator, - this->children->xlator->fops->unlink, 
&local->loc); - - return 0; + while (trav) { + if (strncmp(path, trav->path, strlen(trav->path)) == 0) { + match++; + break; } - - if ((op_ret == -1) && (op_errno == EISDIR)) { - gf_log (this->name, GF_LOG_DEBUG, - "target(%s) exists as directory, cannot keep copy, " - "deleting", local->newpath); - - STACK_WIND (frame, trash_common_unwind_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, &local->loc); - return 0; - } - - /* All other cases, unlink should return success */ - TRASH_STACK_UNWIND (unlink, frame, 0, op_errno, &local->preparent, - &local->postparent); - - return 0; + trav = trav->next; + } + return match; } - - -int32_t -trash_common_unwind_buf_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf) +/** + * Stores the eliminate path into internal eliminate path structure + */ +int +store_eliminate_path(char *str, trash_elim_path **eliminate) { - TRASH_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + trash_elim_path *trav = NULL; + char *component = NULL; + char elm_path[PATH_MAX] = { + 0, + }; + int ret = 0; + char *strtokptr = NULL; + + if ((str == NULL) || (eliminate == NULL)) { + ret = EINVAL; + goto out; + } + + component = strtok_r(str, ",", &strtokptr); + while (component) { + trav = GF_CALLOC(1, sizeof(*trav), gf_trash_mt_trash_elim_path); + if (!trav) { + ret = ENOMEM; + goto out; + } + if (component[0] == '/') + sprintf(elm_path, "%s", component); + else + sprintf(elm_path, "/%s", component); + + if (component[strlen(component) - 1] != '/') + strncat(elm_path, "/", sizeof(elm_path) - strlen(elm_path) - 1); + + trav->path = gf_strdup(elm_path); + if (!trav->path) { + ret = ENOMEM; + gf_log("trash", GF_LOG_DEBUG, "out of memory"); + GF_FREE(trav); + goto out; + } + trav->next = *eliminate; + *eliminate = trav; + component = strtok_r(NULL, ",", &strtokptr); + } +out: + return ret; } -int -trash_common_rename_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) +/** + * Appends time stamp to given string + */ +void +append_time_stamp(char *name, size_t name_size) { - TRASH_STACK_UNWIND (rename, frame, op_ret, op_errno, stbuf, preoldparent, - postoldparent, prenewparent, postnewparent); - return 0; + int i; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + + gf_time_fmt(timestr, sizeof(timestr), gf_time(), gf_timefmt_F_HMS); + + /* removing white spaces in timestamp */ + for (i = 0; i < strlen(timestr); i++) { + if (timestr[i] == ' ') + timestr[i] = '_'; + } + strncat(name, "_", name_size - strlen(name) - 1); + strncat(name, timestr, name_size - strlen(name) - 1); } +/* * + * Check whether delete/rename operation is permitted on + * trash directory + */ -int32_t -trash_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +gf_boolean_t +check_whether_op_permitted(trash_private_t *priv, loc_t *loc) { - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - loc_t new_loc = {0,}; - - priv = this->private; - local = frame->local; - - if (-1 == op_ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: %s", - local->loc.path, strerror (op_errno)); - goto fail; - } - - if ((buf->ia_size == 0) || - (buf->ia_size > priv->max_trash_file_size)) { - /* if the file is too big or zero, just unlink it */ - - if (buf->ia_size > priv->max_trash_file_size) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: file size too big (%"PRId64") to " - "move into trash directory", - local->loc.path, buf->ia_size); - } - - STACK_WIND (frame, trash_common_unwind_cbk, - this->children->xlator, - this->children->xlator->fops->unlink, &local->loc); - return 0; - } - - new_loc.path = local->newpath; + if ((priv->state && (gf_uuid_compare(loc->inode->gfid, trash_gfid) == 0))) + 
return _gf_false; + if (priv->internal && + (gf_uuid_compare(loc->inode->gfid, internal_op_gfid) == 0)) + return _gf_false; - STACK_WIND (frame, trash_unlink_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->loc, &new_loc); + return _gf_true; +} - return 0; +/** + * Wipe the memory used by trash location variable + */ +void +trash_local_wipe(trash_local_t *local) +{ + if (!local) + goto out; -fail: - TRASH_STACK_UNWIND (unlink, frame, op_ret, op_errno, buf, - NULL); + loc_wipe(&local->loc); + loc_wipe(&local->newloc); - return 0; + if (local->fd) + fd_unref(local->fd); + if (local->newfd) + fd_unref(local->newfd); + mem_put(local); +out: + return; } -int32_t -trash_rename_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) +/** + * Wipe the memory used by eliminate path through a + * recursive call + */ +void +wipe_eliminate_path(trash_elim_path **trav) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *dir_name = NULL; - char *tmp_path = NULL; - loc_t tmp_loc = {0,}; - - local = frame->local; - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - dir_name = dirname (tmp_str); - - /* check for the errno, if its ENOENT create directory and call - * rename later - */ - tmp_path = gf_strdup (dir_name); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; - - /* TODO: create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_rename_mkdir_cbk, tmp_path, - this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - - GF_FREE (tmp_str); - return 0; - } - - if ((op_ret == -1) && (op_errno == ENOTDIR)) { - gf_log (this->name, 
GF_LOG_DEBUG, - "target(%s) exists, cannot keep the dest entry(%s): " - "renaming", local->newpath, local->origpath); - } else if ((op_ret == -1) && (op_errno == EISDIR)) { - gf_log (this->name, GF_LOG_DEBUG, - "target(%s) exists as a directory, cannot keep the " - "copy (%s), renaming", local->newpath, local->origpath); - } + if (trav == NULL) { + return; + } - STACK_WIND (frame, trash_common_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, &local->loc, - &local->newloc); + if (*trav == NULL) { + return; + } - return 0; + wipe_eliminate_path(&(*trav)->next); + GF_FREE((*trav)->path); + GF_FREE(*trav); + *trav = NULL; } - +/** + * This is the call back of rename fop initated using STACK_WIND in + * reconfigure/notify function which is used to rename trash directory + * in the brick when it is required either in volume start or set. + * This frame must destroyed from this function itself since it was + * created by trash xlator + */ int32_t -trash_rename_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +trash_dir_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *tmp_path = NULL; - char *tmp_dirname = NULL; - char *dir_name = NULL; - int32_t count = 0; - loc_t tmp_loc = {0,}; - - local = frame->local; - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - goto out; - } + trash_private_t *priv = NULL; + trash_local_t *local = NULL; - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 
0) - count = 1; + priv = this->private; - tmp_dirname = strchr (tmp_str + count + 1, '/'); + local = frame->local; - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } + if (op_ret == -1) { + gf_log(this->name, GF_LOG_ERROR, + "rename trash directory " + "failed: %s", + strerror(op_errno)); + goto out; + } - tmp_loc.path = tmp_path; + GF_FREE(priv->oldtrash_dir); - /* TODO: create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_rename_mkdir_cbk, - tmp_path, this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - } + priv->oldtrash_dir = gf_strdup(priv->newtrash_dir); + if (!priv->oldtrash_dir) { + op_ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + } - goto out; - } - - dir_name = dirname (tmp_str); - if (strcmp ((char*)cookie, dir_name) == 0) { - tmp_loc.path = local->newpath; +out: + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return op_ret; +} - STACK_WIND (frame, trash_rename_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->newloc, &tmp_loc); - } +int +rename_trash_directory(xlator_t *this) +{ + trash_private_t *priv = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + loc_t old_loc = { + 0, + }; + call_frame_t *frame = NULL; + trash_local_t *local = NULL; + + priv = this->private; + + frame = create_frame(this, this->ctx->pool); + if (frame == NULL) { + gf_log(this->name, GF_LOG_ERROR, "failed to create frame"); + ret = ENOMEM; + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + frame->local = local; + + /* assign new location values to new_loc members */ + gf_uuid_copy(loc.gfid, trash_gfid); + gf_uuid_copy(loc.pargfid, root_gfid); + ret = extract_trash_directory(priv->newtrash_dir, &loc.name); + if (ret) { + gf_log(this->name, 
GF_LOG_DEBUG, "out of memory"); + goto out; + } + loc.path = gf_strdup(priv->newtrash_dir); + if (!loc.path) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + + /* assign old location values to old_loc members */ + gf_uuid_copy(old_loc.gfid, trash_gfid); + gf_uuid_copy(old_loc.pargfid, root_gfid); + ret = extract_trash_directory(priv->oldtrash_dir, &old_loc.name); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + old_loc.path = gf_strdup(priv->oldtrash_dir); + if (!old_loc.path) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + + old_loc.inode = inode_ref(priv->trash_inode); + gf_uuid_copy(old_loc.inode->gfid, old_loc.gfid); + + loc_copy(&local->loc, &old_loc); + loc_copy(&local->newloc, &loc); + + STACK_WIND(frame, trash_dir_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &old_loc, &loc, NULL); + return 0; out: - GF_FREE (cookie); /* strdup (dir_name) was sent here :) */ - GF_FREE (tmp_str); + if (frame) { + frame->local = NULL; + STACK_DESTROY(frame->root); + } - return 0; + trash_local_wipe(local); + + return ret; } int32_t -trash_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +trash_internal_op_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - loc_t tmp_loc = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret == -1) { - STACK_WIND (frame, trash_common_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->loc, &local->newloc); - return 0; - } - if ((buf->ia_size == 0) || - (buf->ia_size > priv->max_trash_file_size)) { - /* if the file 
is too big or zero, just unlink it */ - - if (buf->ia_size > priv->max_trash_file_size) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: file size too big (%"PRId64") to " - "move into trash directory", - local->newloc.path, buf->ia_size); - } - - STACK_WIND (frame, trash_common_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->loc, &local->newloc); - return 0; - } - - tmp_loc.path = local->newpath; - - STACK_WIND (frame, trash_rename_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - &local->newloc, &tmp_loc); - - return 0; + trash_local_t *local = NULL; + local = frame->local; + + if (op_ret != 0 && !(op_errno == EEXIST)) + gf_log(this->name, GF_LOG_ERROR, + "mkdir failed for " + "internal op directory : %s", + strerror(op_errno)); + + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return op_ret; } +/** + * This is the call back of mkdir fop initated using STACK_WIND in + * notify/reconfigure function which is used to create trash directory + * in the brick when "trash" is on. 
The frame of the mkdir must + * destroyed from this function itself since it was created by trash xlator + */ int32_t -trash_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) +trash_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - trash_elim_pattern_t *trav = NULL; - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - char timestr[64] = {0,}; - int32_t match = 0; - - priv = this->private; - if (priv->eliminate) { - trav = priv->eliminate; - while (trav) { - if (fnmatch(trav->pattern, newloc->name, 0) == 0) { - match++; - break; - } - trav = trav->next; - } - } + trash_private_t *priv = NULL; + trash_local_t *local = NULL; - if ((strncmp (oldloc->path, priv->trash_dir, - strlen (priv->trash_dir)) == 0) || match) { - /* Trying to rename from the trash dir, - do the actual rename */ - STACK_WIND (frame, trash_common_rename_cbk, - this->children->xlator, - this->children->xlator->fops->rename, - oldloc, newloc); + priv = this->private; - return 0; - } + local = frame->local; - local = mem_get0 (this->local_pool); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - TRASH_STACK_UNWIND (rename, frame, -1, ENOMEM, - NULL, NULL, NULL, NULL, NULL); - return 0; + if (op_ret == 0) { + priv->oldtrash_dir = gf_strdup(priv->newtrash_dir); + if (!priv->oldtrash_dir) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + op_ret = ENOMEM; } + } else if (op_ret != 0 && errno != EEXIST) + gf_log(this->name, GF_LOG_ERROR, + "mkdir failed for trash" + " directory : %s", + strerror(op_errno)); - frame->local = local; - loc_copy (&local->loc, oldloc); - - loc_copy (&local->newloc, newloc); - - strcpy (local->origpath, newloc->path); - strcpy (local->newpath, priv->trash_dir); - strcat (local->newpath, newloc->path); - - { - /* append timestamp to file name */ - /* TODO: can 
we make it optional? */ - gf_time_ftm (timestr, sizeof timestr, time (NULL), - gf_timefmt_F_HMS); - strcat (local->newpath, timestr); - } + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return op_ret; +} - /* Send a lookup call on newloc, to ensure we are not - overwriting */ - STACK_WIND (frame, trash_rename_lookup_cbk, - this->children->xlator, - this->children->xlator->fops->lookup, newloc, 0); +/** + * This getxattr calls returns existing trash directory path in + * the dictionary + */ +int32_t +trash_dir_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + data_t *data = NULL; + trash_private_t *priv = NULL; + int ret = 0; + trash_local_t *local = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + + data = dict_get(dict, GET_ANCESTRY_PATH_KEY); + if (!data) { + goto out; + } + priv->oldtrash_dir = GF_MALLOC(PATH_MAX, gf_common_mt_char); + if (!priv->oldtrash_dir) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + /* appending '/' if it is not present */ + sprintf(priv->oldtrash_dir, "%s%c", data->data, + data->data[strlen(data->data) - 1] != '/' ? '/' : '\0'); + gf_log(this->name, GF_LOG_DEBUG, + "old trash directory path " + "is %s", + priv->oldtrash_dir); + if (strcmp(priv->newtrash_dir, priv->oldtrash_dir) != 0) { + /* When user set a new name for trash directory, trash + * xlator will perform a rename operation on old trash + * directory to the new one using a STACK_WIND from here. + * This option can be configured only when volume is in + * started state + */ + ret = rename_trash_directory(this); + } +out: + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return ret; +} +/** + * This is a nameless look up for internal op directory + * The lookup is based on gfid, because internal op directory + * has fixed gfid. 
+ */ +int32_t +trash_internalop_dir_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + trash_private_t *priv = NULL; + int ret = 0; + uuid_t *gfid_ptr = NULL; + loc_t loc = { + 0, + }; + char internal_op_path[PATH_MAX] = { + 0, + }; + dict_t *dict = NULL; + trash_local_t *local = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + if (op_ret != 0 && op_errno == ENOENT) { + loc_wipe(&local->loc); + gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid_ptr) { + ret = ENOMEM; + goto out; + } + + gf_uuid_copy(*gfid_ptr, internal_op_gfid); + + dict = dict_new(); + if (!dict) { + ret = ENOMEM; + goto out; + } + ret = dict_set_gfuuid(dict, "gfid-req", *gfid_ptr, false); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "setting key gfid-req failed"); + goto out; + } + gf_uuid_copy(loc.gfid, internal_op_gfid); + gf_uuid_copy(loc.pargfid, trash_gfid); + + loc.inode = inode_new(priv->trash_itable); + + /* The mkdir call for creating internal op directory */ + loc.name = gf_strdup("internal_op"); + if (!loc.name) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + sprintf(internal_op_path, "%s%s/", priv->newtrash_dir, loc.name); + + loc.path = gf_strdup(internal_op_path); + if (!loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + loc_copy(&local->loc, &loc); + STACK_WIND(frame, trash_internal_op_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, &loc, 0755, 0022, dict); return 0; + } + +out: + if (ret && gfid_ptr) + GF_FREE(gfid_ptr); + if (dict) + dict_unref(dict); + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return op_ret; } +/** + * This is a nameless look up for old trash directory + * The lookup is based on gfid, because trash directory + * 
has fixed gfid. + */ int32_t -trash_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +trash_dir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - trash_elim_pattern_t *trav = NULL; - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - char timestr[64] = {0,}; - int32_t match = 0; + trash_private_t *priv = NULL; + loc_t loc = { + 0, + }; + int ret = 0; + uuid_t *gfid_ptr = NULL; + dict_t *dict = NULL; + trash_local_t *local = NULL; - priv = this->private; + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); - if (priv->eliminate) { - trav = priv->eliminate; - while (trav) { - if (fnmatch(trav->pattern, loc->name, 0) == 0) { - match++; - break; - } - trav = trav->next; - } - } + local = frame->local; - if ((strncmp (loc->path, priv->trash_dir, - strlen (priv->trash_dir)) == 0) || (match)) { - if (match) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: file matches eliminate pattern, " - "not moved to trash", loc->name); - } else { - /* unlink from the trash-dir, not keeping any copy */ - ; - } - - STACK_WIND (frame, trash_common_unwind_cbk, - this->children->xlator, - this->children->xlator->fops->unlink, loc); - return 0; - } + loc_wipe(&local->loc); + if (op_ret == 0) { + gf_log(this->name, GF_LOG_DEBUG, "inode found with gfid %s", + uuid_utoa(buf->ia_gfid)); - local = mem_get0 (this->local_pool); - if (!local) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - TRASH_STACK_UNWIND (unlink, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - frame->local = local; - loc_copy (&local->loc, loc); - - strcpy (local->origpath, loc->path); - strcpy (local->newpath, priv->trash_dir); - strcat (local->newpath, loc->path); - - { - /* append timestamp to file name */ - /* TODO: can we make it optional? 
*/ - gf_time_fmt (timestr, sizeof timestr, time (NULL), - gf_timefmt_F_HMS); - strcat (local->newpath, timestr); - } + gf_uuid_copy(loc.gfid, trash_gfid); - LOCK_INIT (&frame->lock); + /* Find trash inode using available information */ + priv->trash_inode = inode_link(inode, NULL, NULL, buf); - STACK_WIND (frame, trash_unlink_stat_cbk, - this->children->xlator, - this->children->xlator->fops->stat, loc); + loc.inode = inode_ref(priv->trash_inode); + loc_copy(&local->loc, &loc); + /*Used to find path of old trash directory*/ + STACK_WIND(frame, trash_dir_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, &loc, + GET_ANCESTRY_PATH_KEY, xdata); return 0; + } + + /* If there is no old trash directory we set its value to new one, + * which is the valid condition for trash directory creation + */ + else { + gf_log(this->name, GF_LOG_DEBUG, + "Creating trash " + "directory %s ", + priv->newtrash_dir); + + gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid_ptr) { + ret = ENOMEM; + goto out; + } + gf_uuid_copy(*gfid_ptr, trash_gfid); + + gf_uuid_copy(loc.gfid, trash_gfid); + gf_uuid_copy(loc.pargfid, root_gfid); + ret = extract_trash_directory(priv->newtrash_dir, &loc.name); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + loc.path = gf_strdup(priv->newtrash_dir); + if (!loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + priv->trash_inode = inode_new(priv->trash_itable); + priv->trash_inode->ia_type = IA_IFDIR; + loc.inode = inode_ref(priv->trash_inode); + dict = dict_new(); + if (!dict) { + ret = ENOMEM; + goto out; + } + /* Fixed gfid is set for trash directory with + * this function + */ + ret = dict_set_gfuuid(dict, "gfid-req", *gfid_ptr, false); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "setting key gfid-req failed"); + goto out; + } + loc_copy(&local->loc, &loc); + + /* The mkdir call for creating trash directory */ + STACK_WIND(frame, 
trash_dir_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, &loc, 0755, 0022, dict); + return 0; + } +out: + if (ret && gfid_ptr) + GF_FREE(gfid_ptr); + if (dict) + dict_unref(dict); + frame->local = NULL; + STACK_DESTROY(frame->root); + trash_local_wipe(local); + return ret; } -int32_t -trash_truncate_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) +int +create_or_rename_trash_directory(xlator_t *this) { - /* use this Function when a failure occurs, and - delete the newly created file. */ - trash_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "deleting the newly created file: %s", - strerror (op_errno)); - } + trash_private_t *priv = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + call_frame_t *frame = NULL; + trash_local_t *local = NULL; + + priv = this->private; + + frame = create_frame(this, this->ctx->pool); + if (frame == NULL) { + gf_log(this->name, GF_LOG_ERROR, "failed to create frame"); + ret = ENOMEM; + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + frame->local = local; + + loc.inode = inode_new(priv->trash_itable); + gf_uuid_copy(loc.gfid, trash_gfid); + loc_copy(&local->loc, &loc); + gf_log(this->name, GF_LOG_DEBUG, + "nameless lookup for" + "old trash directory"); + STACK_WIND(frame, trash_dir_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &loc, NULL); +out: + return ret; +} - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, - &local->loc, local->fop_offset); +int +create_internalop_directory(xlator_t *this) +{ + trash_private_t *priv = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + call_frame_t *frame = NULL; + trash_local_t *local = NULL; + + priv = this->private; + + frame = 
create_frame(this, this->ctx->pool); + if (frame == NULL) { + gf_log(this->name, GF_LOG_ERROR, "failed to create frame"); + ret = ENOMEM; + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + frame->local = local; + + gf_uuid_copy(loc.gfid, internal_op_gfid); + gf_uuid_copy(loc.pargfid, trash_gfid); + loc.inode = inode_new(priv->trash_itable); + loc.inode->ia_type = IA_IFDIR; + + loc_copy(&local->loc, &loc); + STACK_WIND(frame, trash_internalop_dir_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &loc, NULL); +out: - return 0; + return ret; } int32_t -trash_truncate_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobuf) +trash_common_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - trash_local_t *local = NULL; + STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} - local = frame->local; +int32_t +trash_common_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + return 0; +} - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "readv on the existing file failed: %s", - strerror (op_errno)); +int32_t +trash_common_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + 
STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} - STACK_WIND (frame, trash_truncate_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - &local->newloc); - goto out; - } +/** + * move backs from trash translator to unlink call + */ +int32_t +trash_common_unwind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + TRASH_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} - local->fsize = stbuf->ia_size; - STACK_WIND (frame, trash_truncate_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - local->newfd, vector, count, local->cur_offset, 0, iobuf); +/** + * If the path is not present in the trash directory,it will recursively + * call this call-back and one by one directories will be created from + * the starting + */ +int32_t +trash_unlink_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + trash_local_t *local = NULL; + char *tmp_str = NULL; + char *tmp_path = NULL; + char *tmp_dirname = NULL; + char *tmp_stat = NULL; + char real_path[PATH_MAX] = { + 0, + }; + char *dir_name = NULL; + size_t count = 0; + int32_t loop_count = 0; + int i = 0; + loc_t tmp_loc = { + 0, + }; + trash_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + TRASH_UNSET_PID(frame, local); + + tmp_str = gf_strdup(local->newpath); + if (!tmp_str) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = -1; + goto out; + } + loop_count = local->loop_count; + + /* The directory is not present , need to create it */ + if ((op_ret == -1) && (op_errno == ENOENT)) { + tmp_dirname = 
strchr(tmp_str, '/'); + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + i++; + if (i > loop_count) + break; + tmp_dirname = strchr(tmp_str + count + 1, '/'); + } + tmp_path = gf_memdup(local->newpath, count + 1); + if (!tmp_path) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + tmp_path[count] = '\0'; + + loc_copy(&tmp_loc, &local->loc); + tmp_loc.path = gf_strdup(tmp_path); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + + /* Stores the the name of directory to be created */ + tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1); + if (!tmp_loc.name) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + + remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, + sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_path, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); + loc_wipe(&tmp_loc); + goto out; + } + + /* Given path is created , comparing to the required path */ + if (op_ret == 0) { + dir_name = dirname(tmp_str); + if (strcmp((char *)cookie, dir_name) == 0) { + /* File path exists we can rename it*/ + loc_copy(&tmp_loc, &local->loc); + tmp_loc.path = local->newpath; + STACK_WIND(frame, trash_unlink_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &local->loc, &tmp_loc, + xdata); + goto out; + } + } + + if ((op_ret == -1) && (op_errno != EEXIST)) { + gf_log(this->name, GF_LOG_ERROR, + "Directory creation failed [%s]. 
" + "Therefore unlinking %s without moving to trash " + "directory", + strerror(op_errno), local->loc.name); + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata); + goto out; + } + + LOCK(&frame->lock); + { + loop_count = ++local->loop_count; + } + UNLOCK(&frame->lock); + + tmp_dirname = strchr(tmp_str, '/'); + + /* Path is not completed , need to create remaining path */ + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + i++; + if (i > loop_count) + break; + tmp_dirname = strchr(tmp_str + count + 1, '/'); + } + tmp_path = gf_memdup(local->newpath, count + 1); + if (!tmp_path) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = -1; + goto out; + } + tmp_path[count] = '\0'; + + loc_copy(&tmp_loc, &local->loc); + tmp_loc.path = gf_strdup(tmp_path); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = -1; + goto out; + } + + /* Stores the the name of directory to be created */ + tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1); + if (!tmp_loc.name) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = -1; + goto out; + } + + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + + remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_path, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); out: - return 0; - + if (tmp_path) + GF_FREE(tmp_path); + if (tmp_str) + GF_FREE(tmp_str); + return ret; } +/** + * The name of unlinking file should be renamed as starting + * from trash directory as mentioned in the mount point + */ int32_t -trash_truncate_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf) +trash_unlink_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - trash_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - /* Let truncate work, but previous copy is not preserved. */ - gf_log (this->name, GF_LOG_DEBUG, - "writev on the existing file failed: %s", - strerror (op_errno)); - - STACK_WIND (frame, trash_truncate_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, &local->newloc); - goto out; - } - - if (local->cur_offset < local->fsize) { - local->cur_offset += GF_BLOCK_READV_SIZE; - /* Loop back and Read the contents again. */ - STACK_WIND (frame, trash_truncate_readv_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, - local->fd, (size_t)GF_BLOCK_READV_SIZE, - local->cur_offset, 0); - goto out; - } - - - /* OOFH.....Finally calling Truncate. 
*/ - STACK_WIND (frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, &local->loc, - local->fop_offset); - + trash_local_t *local = NULL; + trash_private_t *priv = NULL; + char *tmp_str = NULL; + char *dir_name = NULL; + char *tmp_cookie = NULL; + loc_t tmp_loc = { + 0, + }; + dict_t *new_xdata = NULL; + char *tmp_stat = NULL; + char real_path[PATH_MAX] = { + 0, + }; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + if ((op_ret == -1) && (op_errno == ENOENT)) { + /* the file path does not exist we want to create path + * for the file + */ + tmp_str = gf_strdup(local->newpath); + if (!tmp_str) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + dir_name = dirname(tmp_str); /* stores directory name */ + + loc_copy(&tmp_loc, &local->loc); + tmp_loc.path = gf_strdup(dir_name); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + + tmp_cookie = gf_strdup(dir_name); + if (!tmp_cookie) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + remove_trash_path(tmp_str, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, + sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + /* create the directory with proper permissions */ + STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_cookie, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); + loc_wipe(&tmp_loc); + goto out; + } + + if ((op_ret == -1) && (op_errno == ENOTDIR)) { + /* if entry is already present in trash directory, + * new one is not copied*/ + gf_log(this->name, GF_LOG_DEBUG, + "target(%s) exists, cannot keep the copy, deleting", + 
local->newpath); + + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata); + + goto out; + } + + if ((op_ret == -1) && (op_errno == EISDIR)) { + /* if entry is directory,we remove directly */ + gf_log(this->name, GF_LOG_DEBUG, + "target(%s) exists as directory, cannot keep copy, " + "deleting", + local->newpath); + + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata); + goto out; + } + + /********************************************************************** + * + * CTR Xlator message handling done here! + * + **********************************************************************/ + /** + * If unlink is handled by trash translator, it should inform the + * CTR Xlator. And trash translator only handles the unlink for + * the last hardlink. + * + * Check if there is a GF_REQUEST_LINK_COUNT_XDATA from CTR Xlator + * + */ + + if (local->ctr_link_count_req) { + /* Sending back inode link count to ctr_unlink + * (changetimerecoder xlator) via + * "GF_RESPONSE_LINK_COUNT_XDATA" key using xdata. 
+ * */ + if (xdata) { + ret = dict_set_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, 1); + if (ret == -1) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to set" + " GF_RESPONSE_LINK_COUNT_XDATA"); + } + } else { + new_xdata = dict_new(); + if (!new_xdata) { + gf_log(this->name, GF_LOG_WARNING, + "Memory allocation failure while " + "creating new_xdata"); + goto ctr_out; + } + ret = dict_set_uint32(new_xdata, GF_RESPONSE_LINK_COUNT_XDATA, 1); + if (ret == -1) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to set" + " GF_RESPONSE_LINK_COUNT_XDATA"); + } + ctr_out: + TRASH_STACK_UNWIND(unlink, frame, 0, op_errno, preoldparent, + postoldparent, new_xdata); + goto out; + } + } + /* All other cases, unlink should return success */ + TRASH_STACK_UNWIND(unlink, frame, 0, op_errno, preoldparent, postoldparent, + xdata); out: - return 0; -} + if (tmp_str) + GF_FREE(tmp_str); + if (tmp_cookie) + GF_FREE(tmp_cookie); + if (new_xdata) + dict_unref(new_xdata); + return ret; +} +/** + * move backs from trash translator to truncate call + */ int32_t -trash_truncate_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +trash_common_unwind_buf_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - trash_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - //Let truncate work, but previous copy is not preserved. 
- gf_log (this->name, GF_LOG_DEBUG, - "open on the existing file failed: %s", - strerror (op_errno)); + TRASH_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - STACK_WIND (frame, trash_truncate_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - &local->newloc); - goto out; - } +int32_t +trash_unlink_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + trash_private_t *priv = NULL; + trash_local_t *local = NULL; + loc_t new_loc = { + 0, + }; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + if (op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "%s: %s", local->loc.path, + strerror(op_errno)); + TRASH_STACK_UNWIND(unlink, frame, op_ret, op_errno, buf, NULL, xdata); + ret = -1; + goto out; + } + + /* Only last hardlink will be moved to trash directory */ + if (buf->ia_nlink > 1) { + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata); + goto out; + } + + /* if the file is too big just unlink it */ + if (buf->ia_size > (priv->max_trash_file_size)) { + gf_log(this->name, GF_LOG_DEBUG, + "%s: file size too big (%" PRId64 + ") to " + "move into trash directory", + local->loc.path, buf->ia_size); + + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata); + goto out; + } + + /* Copies new path for renaming */ + loc_copy(&new_loc, &local->loc); + new_loc.path = gf_strdup(local->newpath); + if (!new_loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + STACK_WIND(frame, trash_unlink_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &local->loc, &new_loc, xdata); - local->cur_offset = local->fop_offset; +out: + loc_wipe(&new_loc); - 
STACK_WIND (frame, trash_truncate_readv_cbk, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, - local->fd, (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0); + return ret; +} +/** + * Unlink is called internally by rm system call and also + * by internal operations of gluster such as self-heal + */ +int32_t +trash_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + trash_private_t *priv = NULL; + trash_local_t *local = NULL; /* files inside trash */ + int32_t match = 0; + int32_t ctr_link_req = 0; + char *pathbuf = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + /* If trash is not active or not enabled through cli, then + * we bypass and wind back + */ + if (!priv->state) { + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, 0, xdata); + goto out; + } + + /* The files removed by gluster internal operations such as self-heal, + * should moved to trash directory , but files by client should not + * moved + */ + if ((frame->root->pid < 0) && !priv->internal) { + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, 0, xdata); + goto out; + } + /* loc need some gfid which will be present in inode */ + gf_uuid_copy(loc->gfid, loc->inode->gfid); + + /* Checking for valid location */ + if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) { + gf_log(this->name, GF_LOG_DEBUG, "Bad address"); + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, 0, xdata); + ret = EFAULT; + goto out; + } + + /* This will be more accurate */ + inode_path(loc->inode, NULL, &pathbuf); + /* Check whether the file is present under eliminate paths or + * inside trash directory. In both cases we don't need to move the + * file to trash directory. 
Instead delete it permanently + */ + match = check_whether_eliminate_path(priv->eliminate, pathbuf); + if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) == + 0) || + (match)) { + if (match) { + gf_log(this->name, GF_LOG_DEBUG, + "%s is a file comes under an eliminate path, " + "so it is not moved to trash", + loc->name); + } + + /* Trying to unlink from the trash-dir. So do the + * actual unlink without moving to trash-dir. + */ + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, 0, xdata); + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + TRASH_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, xdata); + ret = ENOMEM; + goto out; + } + frame->local = local; + loc_copy(&local->loc, loc); + + /* rename new location of file as starting from trash directory */ + copy_trash_path(priv->newtrash_dir, (frame->root->pid < 0), local->newpath, + sizeof(local->newpath)); + strncat(local->newpath, pathbuf, + sizeof(local->newpath) - strlen(local->newpath) - 1); + + /* append timestamp to file name so that we can avoid + * name collisions inside trash + */ + append_time_stamp(local->newpath, sizeof(local->newpath)); + if (strlen(local->newpath) > PATH_MAX) { + STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, 0, xdata); + goto out; + } + + /* To know whether CTR xlator requested for the link count */ + ret = dict_get_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, &ctr_link_req); + if (ret) { + local->ctr_link_count_req = _gf_false; + ret = 0; + } else + local->ctr_link_count_req = _gf_true; + + LOCK_INIT(&frame->lock); + + STACK_WIND(frame, trash_unlink_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); out: - return 0; + return ret; } - +/** + * Use this when a failure occurs, and delete the newly created file + */ int32_t -trash_truncate_create_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) +trash_truncate_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *dir_name = NULL; - char *tmp_path = NULL; - int32_t flags = 0; - loc_t tmp_loc = {0,}; - - local = frame->local; - - if ((op_ret == -1) && (op_errno == ENOENT)) { - //Creating the directory structure here. - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - dir_name = dirname (tmp_str); - - tmp_path = gf_strdup (dir_name); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; - - /* TODO: create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_truncate_mkdir_cbk, - tmp_path, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - &tmp_loc, 0755, NULL); - GF_FREE (tmp_str); - goto out; - } - - if (op_ret == -1) { - //Let truncate work, but previous copy is not preserved. - //Deleting the newly created copy. 
- gf_log (this->name, GF_LOG_DEBUG, - "creation of new file in trash-dir failed, " - "when truncate was called: %s", strerror (op_errno)); - - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, &local->loc, - local->fop_offset); - goto out; - } + trash_local_t *local = NULL; - flags = O_RDONLY; + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); - local->fd = fd_create (local->loc.inode, frame->root->pid); + if (op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "deleting the newly created file: %s", + strerror(op_errno)); + } - STACK_WIND (frame, trash_truncate_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &local->loc, flags, - local->fd, 0); + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); out: - return 0; + return 0; } +/** + * Read from source file + */ int32_t -trash_truncate_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +trash_truncate_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, + struct iobref *iobuf, dict_t *xdata) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *tmp_path = NULL; - char *tmp_dirname = NULL; - char *dir_name = NULL; - int32_t count = 0; - int32_t flags = 0; - int32_t loop_count = 0; - int i = 0; - loc_t tmp_loc = {0,}; - - local = frame->local; - if (!local) - goto out; - - loop_count = local->loop_count; - - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - goto out; - } - - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - 
count = 1; - i++; - if (i > loop_count) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; - STACK_WIND_COOKIE (frame, trash_truncate_mkdir_cbk, - tmp_path, this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - - goto out; - } + trash_local_t *local = NULL; - if (op_ret == 0) { - dir_name = dirname (tmp_str); - if (strcmp ((char*)cookie, dir_name) == 0) { - flags = O_CREAT|O_EXCL|O_WRONLY; - ia_prot_t prot = {0, }; - - //Call create again once directory structure is created. - STACK_WIND (frame, trash_truncate_create_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - &local->newloc, flags, - st_mode_from_ia (prot, local->loc.inode->ia_type), - local->newfd, NULL); - goto out; - } - } + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); - LOCK (&frame->lock); - { - loop_count = ++local->loop_count; - } - UNLOCK (&frame->lock); + if (op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, + "readv on the existing file failed: %s", strerror(op_errno)); - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - count = 1; - - i++; - if ((i > loop_count) || (count > PATH_MAX)) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; + STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata); + goto out; + } - STACK_WIND_COOKIE (frame, trash_truncate_mkdir_cbk, tmp_path, - this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); + local->fsize = stbuf->ia_size; + STACK_WIND(frame, trash_truncate_writev_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->writev, local->newfd, vector, count, + local->cur_offset, 0, iobuf, xdata); out: - GF_FREE (cookie); /* strdup (dir_name) was sent here :) */ - GF_FREE (tmp_str); - - return 0; + return 0; } - +/** + * Write to file created in trash directory + */ int32_t -trash_truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +trash_truncate_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - char timestr[64] = {0,}; - char loc_newname[PATH_MAX] = {0,}; - int32_t flags = 0; - - priv = this->private; - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "fstat on the file failed: %s", - strerror (op_errno)); - - TRASH_STACK_UNWIND (truncate, frame, op_ret, op_errno, buf, NULL); - return 0; - } - - if ((buf->ia_size == 0) || (buf->ia_size > priv->max_trash_file_size)) { - // If the file is too big, just unlink it. 
- if (buf->ia_size > priv->max_trash_file_size) - gf_log (this->name, GF_LOG_DEBUG, "%s: file too big, " - "not moving to trash", local->loc.path); - - STACK_WIND (frame, trash_common_unwind_buf_cbk, - this->children->xlator, - this->children->xlator->fops->truncate, - &local->loc, local->fop_offset); - return 0; - } - - strcpy (local->newpath, priv->trash_dir); - strcat (local->newpath, local->loc.path); - - { - gf_time_fmt (timestr, sizeof timestr, time (NULL), - gf_timefmt_F_HMS); - strcat (local->newpath, timestr); - } - strcpy (loc_newname,local->loc.name); - strcat (loc_newname,timestr); - - local->newloc.name = gf_strdup (loc_newname); - local->newloc.path = gf_strdup (local->newpath); - local->newloc.inode = inode_new (local->loc.inode->table); - local->newfd = fd_create (local->newloc.inode, frame->root->pid); - - flags = O_CREAT|O_EXCL|O_WRONLY; - - STACK_WIND (frame, trash_truncate_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - &local->newloc, flags, - st_mode_from_ia (buf->ia_prot, local->loc.inode->ia_type), - local->newfd, NULL); + trash_local_t *local = NULL; + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + if (op_ret == -1) { + /* Let truncate work, but previous copy is not preserved. */ + gf_log(this->name, GF_LOG_DEBUG, + "writev on the existing file failed: %s", strerror(op_errno)); + + STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata); + goto out; + } + + if (local->cur_offset < local->fsize) { + local->cur_offset += GF_BLOCK_READV_SIZE; + /* Loop back and Read the contents again. */ + STACK_WIND(frame, trash_truncate_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, local->fd, + (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0, xdata); + goto out; + } + + /* OOFH.....Finally calling Truncate. 
*/ + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); - return 0; +out: + return 0; } +/** + * The source file is opened for reading and writing + */ int32_t -trash_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) +trash_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - trash_elim_pattern_t *trav = NULL; - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - int32_t match = 0; - - priv = this->private; - if (priv->eliminate) { - trav = priv->eliminate; - while (trav) { - if (fnmatch(trav->pattern, loc->name, 0) == 0) { - match++; - break; - } - trav = trav->next; - } - } + trash_local_t *local = NULL; - if ((strncmp (loc->path, priv->trash_dir, - strlen (priv->trash_dir)) == 0) || (offset) || (match)) { - if (match) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: file not moved to trash as per option " - "'eliminate'", loc->path); - } - - // Trying to truncate from the trash can dir, - // do the actual truncate without moving to trash-dir. - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - goto out; - } + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); - LOCK_INIT (&frame->lock); + if (op_ret == -1) { + /* Let truncate work, but previous copy is not preserved. 
*/ + gf_log(this->name, GF_LOG_DEBUG, "open on the existing file failed: %s", + strerror(op_errno)); - local = mem_get0 (this->local_pool); - if (!local) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - TRASH_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata); + goto out; + } - loc_copy (&local->loc, loc); + fd_bind(fd); - local->fop_offset = offset; + local->cur_offset = 0; - frame->local = local; - - STACK_WIND (frame, trash_truncate_stat_cbk, - this->children->xlator, - this->children->xlator->fops->stat, loc); + STACK_WIND(frame, trash_truncate_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, local->fd, + (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0, xdata); out: - return 0; + return 0; } +/** + * Creates new file descriptor for read and write operations, + * if the path is present in trash directory + */ int32_t -trash_ftruncate_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) +trash_truncate_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - trash_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to unlink new file: %s", - local->newloc.path, strerror(op_errno)); - + trash_local_t *local = NULL; + char *tmp_str = NULL; + char *dir_name = NULL; + char *tmp_path = NULL; + int32_t flags = 0; + loc_t tmp_loc = { + 0, + }; + char *tmp_stat = NULL; + char real_path[PATH_MAX] = { + 0, + }; + trash_private_t *priv = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + 
+ TRASH_UNSET_PID(frame, local); + + /* Checks whether path is present in trash directory or not */ + + if ((op_ret == -1) && (op_errno == ENOENT)) { + /* Creating the directory structure here. */ + tmp_str = gf_strdup(local->newpath); + if (!tmp_str) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; } + dir_name = dirname(tmp_str); - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, - local->fd, local->fop_offset); + tmp_path = gf_strdup(dir_name); + if (!tmp_path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + loc_copy(&tmp_loc, &local->newloc); + tmp_loc.path = gf_strdup(tmp_path); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, + sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + /* create the directory with proper permissions */ + STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); + loc_wipe(&tmp_loc); + goto out; + } + + if (op_ret == -1) { + /* Let truncate work, but previous copy is not preserved. + * Deleting the newly created copy. 
+ */ + gf_log(this->name, GF_LOG_DEBUG, + "creation of new file in trash-dir failed, " + "when truncate was called: %s", + strerror(op_errno)); + + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); + goto out; + } + + fd_bind(fd); + flags = O_RDONLY; + + /* fd which represents source file for reading and writing from it */ + + local->fd = fd_create(local->loc.inode, frame->root->pid); + + STACK_WIND(frame, trash_truncate_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, &local->loc, flags, local->fd, 0); +out: + if (tmp_str) + GF_FREE(tmp_str); + if (tmp_path) + GF_FREE(tmp_path); - return 0; + return 0; } +/** + * If the path is not present in the trash directory,it will recursively call + * this call-back and one by one directories will be created from the + * beginning + */ int32_t -trash_ftruncate_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prebuf, struct iatt *postbuf) +trash_truncate_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - trash_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) { - STACK_WIND (frame, trash_ftruncate_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - &local->newloc); - return 0; - } - - if (local->cur_offset < local->fsize) { - local->cur_offset += GF_BLOCK_READV_SIZE; - STACK_WIND (frame, trash_ftruncate_readv_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, - local->fd, (size_t)GF_BLOCK_READV_SIZE, - local->cur_offset, 0); - return 0; - } + trash_local_t *local = NULL; + trash_private_t *priv = NULL; + char *tmp_str = NULL; + char *tmp_path = NULL; + char *tmp_dirname = NULL; + char *dir_name = NULL; + char *tmp_stat = NULL; + char real_path[PATH_MAX] = { + 0, 
+ }; + size_t count = 0; + int32_t flags = 0; + int32_t loop_count = 0; + int i = 0; + loc_t tmp_loc = { + 0, + }; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + loop_count = local->loop_count; + + TRASH_UNSET_PID(frame, local); + + tmp_str = gf_strdup(local->newpath); + if (!tmp_str) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + if ((op_ret == -1) && (op_errno == ENOENT)) { + tmp_dirname = strchr(tmp_str, '/'); + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + i++; + if (i > loop_count) + break; + tmp_dirname = strchr(tmp_str + count + 1, '/'); + } + tmp_path = gf_memdup(local->newpath, count + 1); + if (!tmp_path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + tmp_path[count] = '\0'; + + loc_copy(&tmp_loc, &local->newloc); + tmp_loc.path = gf_strdup(tmp_path); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + /* Stores the the name of directory to be created */ + tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1); + if (!tmp_loc.name) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, + sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); + loc_wipe(&tmp_loc); + goto out; + } + + if (op_ret == 0) { + dir_name = dirname(tmp_str); + if (strcmp((char *)cookie, dir_name) == 0) { + flags = O_CREAT | O_EXCL | O_WRONLY; + 
strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + strncat(real_path, local->origpath, + sizeof(real_path) - strlen(real_path) - 1); + /* Call create again once directory structure + is created. */ + + TRASH_SET_PID(frame, local); + + STACK_WIND(frame, trash_truncate_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, &local->newloc, flags, + get_permission(real_path), 0022, local->newfd, xdata); + goto out; + } + } + + if ((op_ret == -1) && (op_errno != EEXIST)) { + gf_log(this->name, GF_LOG_ERROR, + "Directory creation failed [%s]. " + "Therefore truncating %s without moving the " + "original copy to trash directory", + strerror(op_errno), local->loc.name); + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); + goto out; + } + + LOCK(&frame->lock); + { + loop_count = ++local->loop_count; + } + UNLOCK(&frame->lock); + + tmp_dirname = strchr(tmp_str, '/'); + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + i++; + if (i > loop_count) + break; + tmp_dirname = strchr(tmp_str + count + 1, '/'); + } + tmp_path = gf_memdup(local->newpath, count + 1); + if (!tmp_path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + tmp_path[count] = '\0'; + + loc_copy(&tmp_loc, &local->newloc); + tmp_loc.path = gf_strdup(tmp_path); + if (!tmp_loc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + + /* Stores the the name of directory to be created */ + tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1); + if (!tmp_loc.name) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + + strncpy(real_path, priv->brick_path, sizeof(real_path)); + real_path[sizeof(real_path) - 1] = 0; + remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat); + if (tmp_stat) + strncat(real_path, tmp_stat, 
sizeof(real_path) - strlen(real_path) - 1); + + TRASH_SET_PID(frame, local); + + STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, get_permission(real_path), 0022, xdata); - STACK_WIND (frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, local->fd, - local->fop_offset); +out: + if (tmp_str) + GF_FREE(tmp_str); + if (tmp_path) + GF_FREE(tmp_path); - return 0; + return ret; } - int32_t -trash_ftruncate_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobuf) +trash_truncate_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - trash_local_t *local = NULL; + trash_private_t *priv = NULL; + trash_local_t *local = NULL; + char loc_newname[PATH_MAX] = { + 0, + }; + int32_t flags = 0; + dentry_t *dir_entry = NULL; + inode_table_t *table = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO("trash", local, out); + + table = local->loc.inode->table; + + pthread_mutex_lock(&table->lock); + { + dir_entry = __dentry_search_arbit(local->loc.inode); + } + pthread_mutex_unlock(&table->lock); + + if (op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "fstat on the file failed: %s", + strerror(op_errno)); + + TRASH_STACK_UNWIND(truncate, frame, op_ret, op_errno, buf, NULL, xdata); + goto out; + } + + /* Only last hardlink will be moved to trash directory */ + if (buf->ia_nlink > 1) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); + goto out; + } + + /** + * If the file is too big or if it is extended truncate, + * just don't move it to trash directory. 
+ */ + if (buf->ia_size > (priv->max_trash_file_size) || + buf->ia_size <= local->fop_offset) { + gf_log(this->name, GF_LOG_DEBUG, + "%s: file is too large to move to trash", local->loc.path); + + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); + goto out; + } + + /* Retrieves the name of file from path */ + local->loc.name = gf_strdup(strrchr(local->loc.path, '/')); + if (!local->loc.name) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + + /* Stores new path for source file */ + copy_trash_path(priv->newtrash_dir, (frame->root->pid < 0), local->newpath, + sizeof(local->newpath)); + strncat(local->newpath, local->loc.path, + sizeof(local->newpath) - strlen(local->newpath) - 1); + + /* append timestamp to file name so that we can avoid + name collisions inside trash */ + append_time_stamp(local->newpath, sizeof(local->newpath)); + if (strlen(local->newpath) > PATH_MAX) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->fop_offset, xdata); + goto out; + } + + strncpy(loc_newname, local->loc.name, sizeof(loc_newname)); + loc_newname[sizeof(loc_newname) - 1] = 0; + append_time_stamp(loc_newname, sizeof(loc_newname)); + /* local->newloc represents old file(file inside trash), + where as local->loc represents truncated file. 
We need + to create new inode and fd for new file*/ + local->newloc.name = gf_strdup(loc_newname); + if (!local->newloc.name) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + local->newloc.path = gf_strdup(local->newpath); + if (!local->newloc.path) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; + } + local->newloc.inode = inode_new(local->loc.inode->table); + local->newfd = fd_create(local->newloc.inode, frame->root->pid); + + /* Creating valid parent and pargfids for both files */ + + if (dir_entry == NULL) { + ret = EINVAL; + goto out; + } + local->loc.parent = inode_ref(dir_entry->parent); + gf_uuid_copy(local->loc.pargfid, dir_entry->parent->gfid); + + local->newloc.parent = inode_ref(dir_entry->parent); + gf_uuid_copy(local->newloc.pargfid, dir_entry->parent->gfid); + + flags = O_CREAT | O_EXCL | O_WRONLY; + + TRASH_SET_PID(frame, local); + + STACK_WIND(frame, trash_truncate_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, &local->newloc, flags, + st_mode_from_ia(buf->ia_prot, local->loc.inode->ia_type), 0022, + local->newfd, xdata); - local = frame->local; - local->fsize = stbuf->ia_size; - - if (op_ret == -1) { - STACK_WIND (frame, trash_ftruncate_unlink_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - &local->newloc); - return 0; - } +out: + return ret; +} - STACK_WIND (frame, trash_ftruncate_writev_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, - local->newfd, vector, count, local->cur_offset, 0, NULL); +/** + * Truncate can be explicitly called or implicitly by some other applications + * like text editors etc.. 
+ */ +int32_t +trash_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + trash_private_t *priv = NULL; + trash_local_t *local = NULL; + int32_t match = 0; + char *pathbuf = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + /* If trash is not active or not enabled through cli, then + * we bypass and wind back + */ + if (!priv->state) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + goto out; + } + + /* The files removed by gluster operations such as self-heal, + should moved to trash directory, but files by client should + not moved */ + if ((frame->root->pid < 0) && !priv->internal) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + goto out; + } + /* This will be more accurate */ + inode_path(loc->inode, NULL, &pathbuf); + + /* Checks whether file is in trash directory or eliminate path. + * In all such cases it does not move to trash directory, + * truncate will be performed + */ + match = check_whether_eliminate_path(priv->eliminate, pathbuf); + + if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) == + 0) || + (match)) { + if (match) { + gf_log(this->name, GF_LOG_DEBUG, + "%s: file not moved to trash as per option " + "'eliminate path'", + loc->path); + } + + /* Trying to truncate from the trash-dir. So do the + * actual truncate without moving to trash-dir. 
+ */ + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + goto out; + } + + LOCK_INIT(&frame->lock); + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + TRASH_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, xdata); + ret = ENOMEM; + goto out; + } + + strncpy(local->origpath, pathbuf, sizeof(local->origpath)); + local->origpath[sizeof(local->origpath) - 1] = 0; + + loc_copy(&local->loc, loc); + local->loc.path = pathbuf; + local->fop_offset = offset; + + frame->local = local; + + STACK_WIND(frame, trash_truncate_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); - return 0; +out: + return ret; } +/** + * When we call truncate from terminal it comes to ftruncate of trash-xlator. + * Since truncate internally calls ftruncate and we receive fd of the file, + * other than that it also called by Rebalance operation + */ +int32_t +trash_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + trash_private_t *priv = NULL; + trash_local_t *local = NULL; /* file inside trash */ + char *pathbuf = NULL; /* path of file from fd */ + int32_t retval = 0; + int32_t match = 0; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + /* If trash is not active or not enabled through cli, then + * we bypass and wind back + */ + if (!priv->state) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + goto out; + } + + /* The files removed by gluster operations such as self-heal, + * should moved to trash directory, but files by client + * should not moved + */ + if ((frame->root->pid < 0) && !priv->internal) { + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + goto out; + } + /* This will be more accurate */ + 
retval = inode_path(fd->inode, NULL, &pathbuf); + + /* Checking the eliminate path */ + + /* Checks whether file is trash directory or eliminate path or + * invalid fd. In all such cases it does not move to trash directory, + * ftruncate will be performed + */ + match = check_whether_eliminate_path(priv->eliminate, pathbuf); + if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) == + 0) || + match || !retval) { + if (match) { + gf_log(this->name, GF_LOG_DEBUG, + "%s: file matches eliminate path, " + "not moved to trash", + pathbuf); + } + + /* Trying to ftruncate from the trash-dir. So do the + * actual ftruncate without moving to trash-dir + */ + STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + goto out; + } + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + TRASH_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, xdata); + ret = -1; + goto out; + } + + strncpy(local->origpath, pathbuf, sizeof(local->origpath)); + local->origpath[sizeof(local->origpath) - 1] = 0; + + /* To convert fd to location */ + frame->local = local; + + local->loc.path = pathbuf; + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, local->loc.inode->gfid); + + local->fop_offset = offset; + + /* Else remains same to truncate code, so from here flow goes + * to truncate_stat + */ + STACK_WIND(frame, trash_truncate_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); +out: + return ret; +} +/** + * The mkdir call is intercepted to avoid creation of + * trash directory in the mount by the user + */ int32_t -trash_ftruncate_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) +trash_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t 
umask, dict_t *xdata) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *dir_name = NULL; - char *tmp_path = NULL; - loc_t tmp_loc = {0,}; - - local = frame->local; - - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - dir_name = dirname (tmp_str); - - tmp_path = gf_strdup (dir_name); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; - - /* TODO: create the directory with proper permissions */ - STACK_WIND_COOKIE (frame, trash_truncate_mkdir_cbk, - tmp_path, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - &tmp_loc, 0755, NULL); - GF_FREE (tmp_str); - return 0; - } + int32_t op_ret = 0; + int32_t op_errno = 0; + trash_private_t *priv = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + if (!check_whether_op_permitted(priv, loc)) { + gf_log(this->name, GF_LOG_WARNING, + "mkdir issued on %s, which is not permitted", + priv->newtrash_dir); + op_errno = EPERM; + op_ret = -1; + + STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, xdata); + } else { + STACK_WIND(frame, trash_common_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + } - if (op_ret == -1) { - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - local->fd, local->fop_offset); - return 0; - } +out: + return 0; +} - STACK_WIND (frame, trash_ftruncate_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, local->fd, - (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0); +/** + * The rename call is intercepted to avoid renaming + * of trash directory in the mount by the user + */ +int +trash_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + trash_private_t *priv = NULL; + + priv 
= this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + if (!check_whether_op_permitted(priv, oldloc)) { + gf_log(this->name, GF_LOG_WARNING, + "rename issued on %s, which is not permitted", + priv->newtrash_dir); + op_errno = EPERM; + op_ret = -1; + + STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL, xdata); + } else { + STACK_WIND(frame, trash_common_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + } - return 0; +out: + return 0; } - +/** + * The rmdir call is intercepted to avoid deletion of + * trash directory in the mount by the user + */ int32_t -trash_ftruncate_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +trash_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - trash_local_t *local = NULL; - char *tmp_str = NULL; - char *tmp_path = NULL; - char *tmp_dirname = NULL; - char *dir_name = NULL; - int32_t count = 0; - int32_t flags = 0; - int32_t loop_count = 0; - int i = 0; - loc_t tmp_loc = {0,}; - - local = frame->local; - if (!local) - goto out; + int32_t op_ret = 0; + int32_t op_errno = 0; + trash_private_t *priv = NULL; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + if (!check_whether_op_permitted(priv, loc)) { + gf_log(this->name, GF_LOG_WARNING, + "rmdir issued on %s, which is not permitted", + priv->newtrash_dir); + op_errno = EPERM; + op_ret = -1; + + STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, NULL, NULL, xdata); + } else { + STACK_WIND(frame, trash_common_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); + } - loop_count = local->loop_count; +out: + return 0; +} - tmp_str = gf_strdup (local->newpath); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); +/** + * Volume set option is handled by the 
reconfigure function. + * Here we checks whether each option is set or not ,if it + * sets then corresponding modifciations will be made + */ +int +reconfigure(xlator_t *this, dict_t *options) +{ + uint64_t max_fsize = 0; + int ret = 0; + char *tmp = NULL; + char *tmp_str = NULL; + trash_private_t *priv = NULL; + char trash_dir[PATH_MAX] = { + 0, + }; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("trash", priv, out); + + GF_OPTION_RECONF("trash-internal-op", priv->internal, options, bool, out); + GF_OPTION_RECONF("trash-dir", tmp, options, str, out); + + GF_OPTION_RECONF("trash", priv->state, options, bool, out); + + if (priv->state) { + ret = create_or_rename_trash_directory(this); + + if (tmp) + sprintf(trash_dir, "/%s/", tmp); + else + sprintf(trash_dir, "%s", priv->oldtrash_dir); + + if (strcmp(priv->newtrash_dir, trash_dir) != 0) { + /* When user set a new name for trash directory, trash + * xlator will perform a rename operation on old trash + * directory to the new one using a STACK_WIND from here. 
+ * This option can be configured only when volume is in + * started state + */ + + GF_FREE(priv->newtrash_dir); + + priv->newtrash_dir = gf_strdup(trash_dir); + if (!priv->newtrash_dir) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); goto out; - } - - if ((op_ret == -1) && (op_errno == ENOENT)) { - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - count = 1; - i++; - if (i > loop_count) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; - STACK_WIND_COOKIE (frame, trash_ftruncate_mkdir_cbk, - tmp_path, this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); - + } + gf_log(this->name, GF_LOG_DEBUG, + "Renaming %s -> %s from reconfigure", priv->oldtrash_dir, + priv->newtrash_dir); + + if (!priv->newtrash_dir) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; goto out; + } + ret = rename_trash_directory(this); + } + + if (priv->internal) { + ret = create_internalop_directory(this); + } + } + tmp = NULL; + + GF_OPTION_RECONF("trash-max-filesize", max_fsize, options, size_uint64, + out); + if (max_fsize) { + priv->max_trash_file_size = max_fsize; + gf_log(this->name, GF_LOG_DEBUG, "%" GF_PRI_SIZET " max-size", + priv->max_trash_file_size); + } + GF_OPTION_RECONF("trash-eliminate-path", tmp, options, str, out); + if (!tmp) { + gf_log(this->name, GF_LOG_DEBUG, + "no option specified for 'eliminate', using NULL"); + } else { + if (priv->eliminate) + wipe_eliminate_path(&priv->eliminate); + + tmp_str = gf_strdup(tmp); + if (!tmp_str) { + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + ret = ENOMEM; + goto out; } + ret = store_eliminate_path(tmp_str, &priv->eliminate); + } - if (op_ret == 0) { - dir_name = dirname (tmp_str); - if (strcmp ((char*)cookie, dir_name) == 0) { - 
ia_prot_t prot = {0, }; - flags = O_CREAT|O_EXCL|O_WRONLY; - - //Call create again once directory structure is created. - STACK_WIND (frame, trash_ftruncate_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - &local->newloc, flags, - st_mode_from_ia (prot, local->loc.inode->ia_type), - local->newfd, NULL); - goto out; - } - } +out: - LOCK (&frame->lock); - { - loop_count = ++local->loop_count; - } - UNLOCK (&frame->lock); - tmp_dirname = strchr (tmp_str, '/'); - while (tmp_dirname) { - count = tmp_dirname - tmp_str; - if (count == 0) - count = 1; - - i++; - if ((i > loop_count) || (count > PATH_MAX)) - break; - tmp_dirname = strchr (tmp_str + count + 1, '/'); - } - tmp_path = memdup (local->newpath, count); - if (!tmp_path) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - tmp_loc.path = tmp_path; + return ret; +} - STACK_WIND_COOKIE (frame, trash_ftruncate_mkdir_cbk, tmp_path, - this->children->xlator, - this->children->xlator->fops->mkdir, - &tmp_loc, 0755, NULL); +/** + * Notify is used to create the trash directory with fixed gfid + * using STACK_WIND only when posix xlator is up + */ +int +notify(xlator_t *this, int event, void *data, ...) +{ + trash_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO("trash", priv, out); + + /* Check whether posix is up not */ + if (event == GF_EVENT_CHILD_UP) { + if (!priv->state) { + gf_log(this->name, GF_LOG_DEBUG, "trash xlator is off"); + goto out; + } + + /* Here there is two possibilities ,if trash directory already + * exist ,then we need to perform a rename operation on the + * old one. 
Otherwise, we need to create the trash directory + * For both, we need to pass location variable, gfid of parent + * and a frame for calling STACK_WIND.The location variable + * requires name,path,gfid and inode + */ + if (!priv->oldtrash_dir) + ret = create_or_rename_trash_directory(this); + else if (strcmp(priv->newtrash_dir, priv->oldtrash_dir) != 0) + ret = rename_trash_directory(this); + if (ret) + goto out; + + if (priv->internal) + (void)create_internalop_directory(this); + } out: - GF_FREE (cookie); /* strdup (dir_name) was sent here :) */ - GF_FREE (tmp_str); - - return 0; + ret = default_notify(this, event, data); + if (ret) + gf_log(this->name, GF_LOG_INFO, "default notify event failed"); + return ret; } - int32_t -trash_ftruncate_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +mem_acct_init(xlator_t *this) { - trash_private_t *priv = NULL; - trash_local_t *local = NULL; + int ret = -1; - priv = this->private; - local = frame->local; + GF_VALIDATE_OR_GOTO("trash", this, out); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: %s",local->newloc.path, strerror(op_errno)); - - TRASH_STACK_UNWIND (ftruncate, frame, -1, op_errno, buf, NULL); - return 0; - } - if ((buf->ia_size == 0) || (buf->ia_size > priv->max_trash_file_size)) - { - STACK_WIND (frame, trash_common_unwind_buf_cbk, - this->children->xlator, - this->children->xlator->fops->ftruncate, - local->fd, local->fop_offset); - return 0; - } - - - STACK_WIND (frame, trash_ftruncate_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, &local->newloc, - ( O_CREAT | O_EXCL | O_WRONLY ), - st_mode_from_ia (buf->ia_prot, local->loc.inode->ia_type), - local->newfd, NULL); - - return 0; + ret = xlator_mem_acct_init(this, gf_trash_mt_end + 1); + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, + "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; } +/** + * trash_init + */ int32_t 
-trash_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +init(xlator_t *this) { - trash_elim_pattern_t *trav = NULL; - trash_private_t *priv = NULL; - trash_local_t *local = NULL; - dentry_t *dir_entry = NULL; - char *pathbuf = NULL; - inode_t *newinode = NULL; - char timestr[64]; - int32_t retval = 0; - int32_t match = 0; - - priv = this->private; - - dir_entry = __dentry_search_arbit (fd->inode); - retval = inode_path (fd->inode, NULL, &pathbuf); - - if (priv->eliminate) { - trav = priv->eliminate; - while (trav) { - if (fnmatch(trav->pattern, dir_entry->name, 0) == 0) { - match++; - break; - } - trav = trav->next; - } - } - - if ((strncmp (pathbuf, priv->trash_dir, - strlen (priv->trash_dir)) == 0) || - (offset >= priv->max_trash_file_size) || - (!retval) || - match) { - STACK_WIND (frame, trash_common_unwind_buf_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, offset); - return 0; - } - - local = mem_get0 (this->local_pool); - if (!local) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - TRASH_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - gf_time_fmt (timestr, sizeof timestr, time (NULL), gf_timefmt_F_HMS); - strcpy (local->newpath, priv->trash_dir); - strcat (local->newpath, pathbuf); - strcat (local->newpath, timestr); - - local->fd = fd_ref (fd); - newinode = inode_new (fd->inode->table); - local->newfd = fd_create (newinode, frame->root->pid); - frame->local=local; - - local->newloc.inode = newinode; - local->newloc.path = local->newpath; - - local->loc.inode = inode_ref (fd->inode); - local->loc.path = pathbuf; - - local->fop_offset = offset; - local->cur_offset = offset; - - STACK_WIND (frame, trash_ftruncate_fstat_cbk, this->children->xlator, - this->children->xlator->fops->fstat, fd); + trash_private_t *priv = NULL; + int ret = -1; + char *tmp = NULL; + char *tmp_str = NULL; + char trash_dir[PATH_MAX] = { + 0, + }; + uint64_t max_trash_file_size64 = 0; + data_t *data = 
NULL; + + GF_VALIDATE_OR_GOTO("trash", this, out); + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + ret = -1; + goto out; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile"); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_trash_mt_trash_private_t); + if (!priv) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + + /* Trash priv data members are initialized through the following + * set of statements + */ + GF_OPTION_INIT("trash", priv->state, bool, out); + + GF_OPTION_INIT("trash-dir", tmp, str, out); + + /* We store trash dir value as path for easier manipulation*/ + if (!tmp) { + gf_log(this->name, GF_LOG_INFO, + "no option specified for 'trash-dir', " + "using \"/.trashcan/\""); + priv->newtrash_dir = gf_strdup("/.trashcan/"); + if (!priv->newtrash_dir) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + } else { + sprintf(trash_dir, "/%s/", tmp); + priv->newtrash_dir = gf_strdup(trash_dir); + if (!priv->newtrash_dir) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + } + tmp = NULL; + + GF_OPTION_INIT("trash-eliminate-path", tmp, str, out); + if (!tmp) { + gf_log(this->name, GF_LOG_INFO, + "no option specified for 'eliminate', using NULL"); + } else { + tmp_str = gf_strdup(tmp); + if (!tmp_str) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + ret = ENOMEM; + goto out; + } + ret = store_eliminate_path(tmp_str, &priv->eliminate); + } + tmp = NULL; + + GF_OPTION_INIT("trash-max-filesize", max_trash_file_size64, size_uint64, + out); + if (!max_trash_file_size64) { + gf_log(this->name, GF_LOG_ERROR, + "no option specified for 'max-trashable-file-size', " + "using default = %lld MB", + GF_DEFAULT_MAX_FILE_SIZE / GF_UNIT_MB); + priv->max_trash_file_size = GF_DEFAULT_MAX_FILE_SIZE; + } else { + 
priv->max_trash_file_size = max_trash_file_size64; + gf_log(this->name, GF_LOG_DEBUG, "%" GF_PRI_SIZET " max-size", + priv->max_trash_file_size); + } + + GF_OPTION_INIT("trash-internal-op", priv->internal, bool, out); + + this->local_pool = mem_pool_new(trash_local_t, 64); + if (!this->local_pool) { + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + ret = ENOMEM; + goto out; + } + + /* For creating directories inside trash with proper permissions, + * we need to perform stat on that directories, for this we use + * brick path + */ + data = dict_get(this->options, "brick-path"); + if (!data) { + gf_log(this->name, GF_LOG_ERROR, + "no option specified for 'brick-path'"); + ret = ENOMEM; + goto out; + } + priv->brick_path = gf_strdup(data->data); + if (!priv->brick_path) { + ret = ENOMEM; + gf_log(this->name, GF_LOG_DEBUG, "out of memory"); + goto out; + } + + priv->trash_itable = inode_table_new(0, this); + gf_log(this->name, GF_LOG_DEBUG, "brick path is%s", priv->brick_path); + + this->private = (void *)priv; + ret = 0; - return 0; +out: + if (tmp_str) + GF_FREE(tmp_str); + if (ret) { + if (priv) { + if (priv->newtrash_dir) + GF_FREE(priv->newtrash_dir); + if (priv->oldtrash_dir) + GF_FREE(priv->oldtrash_dir); + if (priv->brick_path) + GF_FREE(priv->brick_path); + if (priv->eliminate) + wipe_eliminate_path(&priv->eliminate); + GF_FREE(priv); + } + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + return ret; } /** - * trash_init - + * trash_fini */ -int32_t -init (xlator_t *this) +void +fini(xlator_t *this) { - data_t *data = NULL; - trash_private_t *_priv = NULL; - trash_elim_pattern_t *trav = NULL; - char *tmp_str = NULL; - char *strtokptr = NULL; - char *component = NULL; - char trash_dir[PATH_MAX] = {0,}; - uint64_t max_trash_file_size64 = 0; - - /* Create .trashcan directory in init */ - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "not configured with exactly one child. 
exiting"); - return -1; - } + trash_private_t *priv = NULL; + inode_table_t *inode_table = NULL; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); + GF_VALIDATE_OR_GOTO("trash", this, out); + priv = this->private; + if (priv) { + inode_table = priv->trash_itable; + if (priv->newtrash_dir) { + GF_FREE(priv->newtrash_dir); + priv->newtrash_dir = NULL; } - - _priv = GF_CALLOC (1, sizeof (*_priv), gf_trash_mt_trash_private_t); - if (!_priv) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - return -1; + if (priv->oldtrash_dir) { + GF_FREE(priv->oldtrash_dir); + priv->oldtrash_dir = NULL; } - - data = dict_get (this->options, "trash-dir"); - if (!data) { - gf_log (this->name, GF_LOG_INFO, - "no option specified for 'trash-dir', " - "using \"/.trashcan/\""); - _priv->trash_dir = gf_strdup ("/.trashcan"); - } else { - /* Need a path with '/' as the first char, if not - given, append it */ - if (data->data[0] == '/') { - _priv->trash_dir = gf_strdup (data->data); - } else { - /* TODO: Make sure there is no ".." in the path */ - strcpy (trash_dir, "/"); - strcat (trash_dir, data->data); - _priv->trash_dir = gf_strdup (trash_dir); - } + if (priv->brick_path) { + GF_FREE(priv->brick_path); + priv->brick_path = NULL; } - - data = dict_get (this->options, "eliminate-pattern"); - if (!data) { - gf_log (this->name, GF_LOG_TRACE, - "no option specified for 'eliminate', using NULL"); - } else { - tmp_str = gf_strdup (data->data); - if (!tmp_str) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - } - - /* Match Filename to option specified in eliminate. 
*/ - component = strtok_r (tmp_str, "|", &strtokptr); - while (component) { - trav = GF_CALLOC (1, sizeof (*trav), - gf_trash_mt_trash_elim_pattern_t); - if (!trav) { - gf_log (this->name, GF_LOG_DEBUG, "out of memory"); - break; - } - trav->pattern = component; - trav->next = _priv->eliminate; - _priv->eliminate = trav; - - component = strtok_r (NULL, "|", &strtokptr); - } - } - - /* TODO: do gf_string2sizet () */ - data = dict_get (this->options, "max-trashable-file-size"); - if (!data) { - gf_log (this->name, GF_LOG_DEBUG, - "no option specified for 'max-trashable-file-size', " - "using default = %lld MB", - GF_DEFAULT_MAX_FILE_SIZE / GF_UNIT_MB); - _priv->max_trash_file_size = GF_DEFAULT_MAX_FILE_SIZE; - } else { - (void)gf_string2bytesize (data->data, - &max_trash_file_size64); - if( max_trash_file_size64 > GF_ALLOWED_MAX_FILE_SIZE ) { - gf_log (this->name, GF_LOG_DEBUG, - "Size specified for max-size(in MB) is too " - "large so using 1GB as max-size (NOT IDEAL)"); - _priv->max_trash_file_size = GF_ALLOWED_MAX_FILE_SIZE; - } else - _priv->max_trash_file_size = max_trash_file_size64; - gf_log (this->name, GF_LOG_DEBUG, "%"GF_PRI_SIZET" max-size", - _priv->max_trash_file_size); + if (priv->eliminate) { + wipe_eliminate_path(&priv->eliminate); + priv->eliminate = NULL; } - - this->local_pool = mem_pool_new (trash_local_t, 64); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - return -1; + if (inode_table) { + inode_table_destroy(inode_table); + priv->trash_itable = NULL; } + GF_FREE(priv); + } - - this->private = (void *)_priv; - return 0; -} - -void -fini (xlator_t *this) -{ - trash_private_t *priv = NULL; - - priv = this->private; - GF_FREE (priv); - - return; + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + this->private = NULL; +out: + return; } struct xlator_fops fops = { - .unlink = trash_unlink, - .rename = trash_rename, - .truncate = trash_truncate, 
- .ftruncate = trash_ftruncate, + .unlink = trash_unlink, + .truncate = trash_truncate, + .ftruncate = trash_ftruncate, + .rmdir = trash_rmdir, + .mkdir = trash_mkdir, + .rename = trash_rename, }; -struct xlator_cbks cbks = { -}; +struct xlator_cbks cbks = {}; struct volume_options options[] = { - { .key = { "trash-directory" }, - .type = GF_OPTION_TYPE_PATH, - }, - { .key = { "eliminate-pattern" }, - .type = GF_OPTION_TYPE_STR, - }, - { .key = { "max-trashable-file-size" }, - .type = GF_OPTION_TYPE_SIZET, - }, - { .key = {NULL} }, + { + .key = {"trash"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable/disable trash translator", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"backup"}, + }, + { + .key = {"trash-dir"}, + .type = GF_OPTION_TYPE_STR, + .default_value = ".trashcan", + .description = "Directory for trash files", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"backup"}, + }, + { + .key = {"trash-eliminate-path"}, + .type = GF_OPTION_TYPE_STR, + .description = "Eliminate paths to be excluded " + "from trashing", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"backup"}, + }, + { + .key = {"trash-max-filesize"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "5MB", + .description = "Maximum size of file that can be " + "moved to trash", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"backup"}, + }, + { + .key = {"trash-internal-op"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable/disable trash translator for " + "internal operations", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"backup"}, + }, + {.key = {"brick-path"}, + .type = GF_OPTION_TYPE_PATH, + .default_value = "{{ brick.path }}"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + 
.init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "trash", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/features/trash/src/trash.h b/xlators/features/trash/src/trash.h index 9a7c033617d..6671617c2c6 100644 --- a/xlators/features/trash/src/trash.h +++ b/xlators/features/trash/src/trash.h @@ -10,70 +10,88 @@ #ifndef __TRASH_H__ #define __TRASH_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "defaults.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> #include "inode.c" #include "fnmatch.h" #include <libgen.h> #ifndef GF_BLOCK_READV_SIZE -#define GF_BLOCK_READV_SIZE (128 * GF_UNIT_KB) +#define GF_BLOCK_READV_SIZE (128 * GF_UNIT_KB) #endif #ifndef GF_DEFAULT_MAX_FILE_SIZE #define GF_DEFAULT_MAX_FILE_SIZE (200 * GF_UNIT_MB) #endif -#ifndef GF_ALLOWED_MAX_FILE_SIZE -#define GF_ALLOWED_MAX_FILE_SIZE (1 * GF_UNIT_GB) -#endif - - struct trash_struct { - fd_t *fd; /* for the fd of existing file */ - fd_t *newfd; /* for the newly created file */ - loc_t loc; /* to store the location of the existing file */ - loc_t newloc; /* to store the location for the new file */ - size_t fsize; /* for keeping the size of existing file */ - off_t cur_offset; /* current offset for read and write ops */ - off_t fop_offset; - char origpath[PATH_MAX]; - char newpath[PATH_MAX]; - int32_t loop_count; - struct iatt preparent; - struct iatt postparent; + fd_t *fd; /* for the fd of existing file */ + fd_t *newfd; /* for the newly created file */ + loc_t loc; /* to store the location of the existing file */ + loc_t newloc; /* to store the location for the new file */ + 
size_t fsize; /* for keeping the size of existing file */ + off_t cur_offset; /* current offset for read and write ops */ + off_t fop_offset; /* original offset received with the fop */ + pid_t pid; + char origpath[PATH_MAX]; + char newpath[PATH_MAX]; + int32_t loop_count; + gf_boolean_t is_set_pid; + struct iatt preparent; + struct iatt postparent; + gf_boolean_t ctr_link_count_req; }; typedef struct trash_struct trash_local_t; -struct _trash_elim_pattern; -typedef struct _trash_elim_pattern { - struct _trash_elim_pattern *next; - char *pattern; -} trash_elim_pattern_t; +struct _trash_elim_path { + struct _trash_elim_path *next; + char *path; +}; +typedef struct _trash_elim_path trash_elim_path; struct trash_priv { - char *trash_dir; - trash_elim_pattern_t *eliminate; - size_t max_trash_file_size; + char *oldtrash_dir; + char *newtrash_dir; + char *brick_path; + trash_elim_path *eliminate; + size_t max_trash_file_size; + gf_boolean_t state; + gf_boolean_t internal; + inode_t *trash_inode; + inode_table_t *trash_itable; }; typedef struct trash_priv trash_private_t; -#define TRASH_STACK_UNWIND(op, frame, params ...) do { \ - trash_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_UNWIND_STRICT (op, frame, params); \ - trash_local_wipe (__local); \ - } while (0) +#define TRASH_SET_PID(frame, local) \ + do { \ + GF_ASSERT(!local->is_set_pid); \ + if (!local->is_set_pid) { \ + local->pid = frame->root->pid; \ + frame->root->pid = GF_SERVER_PID_TRASH; \ + local->is_set_pid = _gf_true; \ + } \ + } while (0) + +#define TRASH_UNSET_PID(frame, local) \ + do { \ + GF_ASSERT(local->is_set_pid); \ + if (local->is_set_pid) { \ + frame->root->pid = local->pid; \ + local->is_set_pid = _gf_false; \ + } \ + } while (0) +#define TRASH_STACK_UNWIND(op, frame, params...) 
\ + do { \ + trash_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_UNWIND_STRICT(op, frame, params); \ + trash_local_wipe(__local); \ + } while (0) #endif /* __TRASH_H__ */ diff --git a/xlators/features/upcall/Makefile.am b/xlators/features/upcall/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/upcall/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/upcall/src/Makefile.am b/xlators/features/upcall/src/Makefile.am new file mode 100644 index 00000000000..72b7f55ae0a --- /dev/null +++ b/xlators/features/upcall/src/Makefile.am @@ -0,0 +1,23 @@ +if WITH_SERVER +xlator_LTLIBRARIES = upcall.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +upcall_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +upcall_la_SOURCES = upcall.c upcall-internal.c + +upcall_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la + +noinst_HEADERS = upcall.h upcall-mem-types.h upcall-messages.h \ + upcall-cache-invalidation.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/upcall/src/upcall-cache-invalidation.h b/xlators/features/upcall/src/upcall-cache-invalidation.h new file mode 100644 index 00000000000..db649b2c9a6 --- /dev/null +++ b/xlators/features/upcall/src/upcall-cache-invalidation.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __UPCALL_CACHE_INVALIDATION_H__ +#define __UPCALL_CACHE_INVALIDATION_H__ + +/* The time period for which a client will be notified of cache_invalidation + * events post its last access */ +#define CACHE_INVALIDATION_TIMEOUT "60" + +#endif /* __UPCALL_CACHE_INVALIDATION_H__ */ diff --git a/xlators/features/upcall/src/upcall-internal.c b/xlators/features/upcall/src/upcall-internal.c new file mode 100644 index 00000000000..c641bd6f432 --- /dev/null +++ b/xlators/features/upcall/src/upcall-internal.c @@ -0,0 +1,689 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <unistd.h> +#include <fcntl.h> +#include <limits.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> + +#include <glusterfs/statedump.h> +#include <glusterfs/syncop.h> + +#include "upcall.h" +#include "upcall-mem-types.h" +#include "glusterfs3-xdr.h" +#include "protocol-common.h" +#include <glusterfs/defaults.h> + +/* + * Check if any of the upcall options are enabled: + * - cache_invalidation + */ +gf_boolean_t +is_upcall_enabled(xlator_t *this) +{ + upcall_private_t *priv = NULL; + + if (this->private) { + priv = (upcall_private_t *)this->private; + return priv->cache_invalidation_enabled; + } + + return _gf_false; +} + +/* + * Get the cache_invalidation_timeout + */ +static int32_t +get_cache_invalidation_timeout(xlator_t *this) +{ + upcall_private_t *priv = NULL; + + if (this->private) { + priv = (upcall_private_t *)this->private; + return priv->cache_invalidation_timeout; + } + + return 0; +} + +static upcall_client_t * +__add_upcall_client(call_frame_t *frame, client_t *client, + upcall_inode_ctx_t *up_inode_ctx, time_t now) +{ + upcall_client_t *up_client_entry = GF_MALLOC( + sizeof(*up_client_entry), gf_upcall_mt_upcall_client_entry_t); + if (!up_client_entry) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY, + "Memory allocation failed"); + return NULL; + } + INIT_LIST_HEAD(&up_client_entry->client_list); + up_client_entry->client_uid = gf_strdup(client->client_uid); + up_client_entry->access_time = now; + up_client_entry->expire_time_attr = get_cache_invalidation_timeout( + frame->this); + + list_add_tail(&up_client_entry->client_list, &up_inode_ctx->client_list); + + gf_log(THIS->name, GF_LOG_DEBUG, "upcall_entry_t client added - %s", + up_client_entry->client_uid); + + return up_client_entry; +} + +static int +__upcall_inode_ctx_set(inode_t *inode, xlator_t *this) +{ + upcall_inode_ctx_t *inode_ctx = NULL; + 
upcall_private_t *priv = NULL; + int ret = -1; + uint64_t ctx = 0; + + priv = this->private; + GF_ASSERT(priv); + + ret = __inode_ctx_get(inode, this, &ctx); + + if (!ret) + goto out; + + inode_ctx = GF_MALLOC(sizeof(upcall_inode_ctx_t), + gf_upcall_mt_upcall_inode_ctx_t); + + if (!inode_ctx) { + ret = -ENOMEM; + goto out; + } + + pthread_mutex_init(&inode_ctx->client_list_lock, NULL); + INIT_LIST_HEAD(&inode_ctx->inode_ctx_list); + INIT_LIST_HEAD(&inode_ctx->client_list); + inode_ctx->destroy = 0; + gf_uuid_copy(inode_ctx->gfid, inode->gfid); + + ctx = (long)inode_ctx; + ret = __inode_ctx_set(inode, this, &ctx); + if (ret) { + gf_log(this->name, GF_LOG_DEBUG, "failed to set inode ctx (%p)", inode); + GF_FREE(inode_ctx); + goto out; + } + + /* add this inode_ctx to the global list */ + LOCK(&priv->inode_ctx_lk); + { + list_add_tail(&inode_ctx->inode_ctx_list, &priv->inode_ctx_list); + } + UNLOCK(&priv->inode_ctx_lk); +out: + return ret; +} + +static upcall_inode_ctx_t * +__upcall_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + upcall_inode_ctx_t *inode_ctx = NULL; + uint64_t ctx = 0; + int ret = 0; + + ret = __inode_ctx_get(inode, this, &ctx); + + if (ret < 0) { + ret = __upcall_inode_ctx_set(inode, this); + if (ret < 0) + goto out; + + ret = __inode_ctx_get(inode, this, &ctx); + if (ret < 0) + goto out; + } + + inode_ctx = (upcall_inode_ctx_t *)(long)(ctx); + +out: + return inode_ctx; +} + +upcall_inode_ctx_t * +upcall_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + upcall_inode_ctx_t *inode_ctx = NULL; + + LOCK(&inode->lock); + { + inode_ctx = __upcall_inode_ctx_get(inode, this); + } + UNLOCK(&inode->lock); + + return inode_ctx; +} + +static int +__upcall_cleanup_client_entry(upcall_client_t *up_client) +{ + list_del_init(&up_client->client_list); + + GF_FREE(up_client->client_uid); + GF_FREE(up_client); + + return 0; +} + +static int +upcall_cleanup_expired_clients(xlator_t *this, upcall_inode_ctx_t *up_inode_ctx, + time_t now) +{ + upcall_client_t 
*up_client = NULL; + upcall_client_t *tmp = NULL; + int ret = -1; + time_t timeout = 0; + time_t t_expired = 0; + + timeout = get_cache_invalidation_timeout(this); + + pthread_mutex_lock(&up_inode_ctx->client_list_lock); + { + list_for_each_entry_safe(up_client, tmp, &up_inode_ctx->client_list, + client_list) + { + t_expired = now - up_client->access_time; + + if (t_expired > (2 * timeout)) { + gf_log(THIS->name, GF_LOG_TRACE, "Cleaning up client_entry(%s)", + up_client->client_uid); + + ret = __upcall_cleanup_client_entry(up_client); + + if (ret) { + gf_msg("upcall", GF_LOG_WARNING, 0, + UPCALL_MSG_INTERNAL_ERROR, + "Client entry cleanup failed (%p)", up_client); + goto out; + } + } + } + } + pthread_mutex_unlock(&up_inode_ctx->client_list_lock); + + ret = 0; +out: + return ret; +} + +/* + * Free Upcall inode_ctx client list + */ +int +__upcall_cleanup_inode_ctx_client_list(upcall_inode_ctx_t *inode_ctx) +{ + upcall_client_t *up_client = NULL; + upcall_client_t *tmp = NULL; + + list_for_each_entry_safe(up_client, tmp, &inode_ctx->client_list, + client_list) + { + __upcall_cleanup_client_entry(up_client); + } + + return 0; +} + +static void +upcall_cache_forget(xlator_t *this, inode_t *inode, + upcall_inode_ctx_t *up_inode_ctx); + +/* + * Free upcall_inode_ctx + */ +int +upcall_cleanup_inode_ctx(xlator_t *this, inode_t *inode) +{ + uint64_t ctx = 0; + upcall_inode_ctx_t *inode_ctx = NULL; + int ret = 0; + upcall_private_t *priv = NULL; + + priv = this->private; + GF_ASSERT(priv); + + ret = inode_ctx_del(inode, this, &ctx); + + if (ret < 0) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR, + "Failed to del upcall_inode_ctx (%p)", inode); + goto out; + } + + inode_ctx = (upcall_inode_ctx_t *)(long)ctx; + + if (inode_ctx) { + /* Invalidate all the upcall cache entries */ + upcall_cache_forget(this, inode, inode_ctx); + + /* do we really need lock? yes now reaper thread + * may also be trying to cleanup the client entries. 
+ */ + pthread_mutex_lock(&inode_ctx->client_list_lock); + { + if (!list_empty(&inode_ctx->client_list)) { + __upcall_cleanup_inode_ctx_client_list(inode_ctx); + } + } + pthread_mutex_unlock(&inode_ctx->client_list_lock); + + /* Mark the inode_ctx to be destroyed */ + inode_ctx->destroy = 1; + gf_msg_debug("upcall", 0, "set upcall_inode_ctx (%p) to destroy mode", + inode_ctx); + } + +out: + return ret; +} + +/* + * Traverse through the list of upcall_inode_ctx(s), + * cleanup the expired client entries and destroy the ctx + * which is no longer valid and has destroy bit set. + */ +void * +upcall_reaper_thread(void *data) +{ + upcall_private_t *priv = NULL; + upcall_inode_ctx_t *inode_ctx = NULL; + upcall_inode_ctx_t *tmp = NULL; + xlator_t *this = NULL; + time_t timeout = 0; + time_t time_now; + + this = (xlator_t *)data; + GF_ASSERT(this); + + priv = this->private; + GF_ASSERT(priv); + + time_now = gf_time(); + while (!priv->fini) { + list_for_each_entry_safe(inode_ctx, tmp, &priv->inode_ctx_list, + inode_ctx_list) + { + /* cleanup expired clients */ + upcall_cleanup_expired_clients(this, inode_ctx, time_now); + + if (!inode_ctx->destroy) { + continue; + } + + /* client list would have been cleaned up*/ + gf_msg_debug("upcall", 0, "Freeing upcall_inode_ctx (%p)", + inode_ctx); + LOCK(&priv->inode_ctx_lk); + { + list_del_init(&inode_ctx->inode_ctx_list); + pthread_mutex_destroy(&inode_ctx->client_list_lock); + } + UNLOCK(&priv->inode_ctx_lk); + GF_FREE(inode_ctx); + inode_ctx = NULL; + } + + /* don't do a very busy loop */ + timeout = get_cache_invalidation_timeout(this); + sleep(timeout / 2); + time_now = gf_time(); + } + + return NULL; +} + +/* + * Initialize upcall reaper thread. 
+ */ +int +upcall_reaper_thread_init(xlator_t *this) +{ + upcall_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + GF_ASSERT(priv); + + ret = gf_thread_create(&priv->reaper_thr, NULL, upcall_reaper_thread, this, + "upreaper"); + + return ret; +} + +int +up_compare_afr_xattr(dict_t *d, char *k, data_t *v, void *tmp) +{ + dict_t *dict = tmp; + + if (!strncmp(k, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)) && + (!is_data_equal(v, dict_get(dict, k)))) + return -1; + + return 0; +} + +static void +up_filter_afr_xattr(dict_t *xattrs, char *xattr, data_t *v) +{ + /* Filter the afr pending xattrs, with value 0. Ideally this should + * be executed only in case of xattrop and not in set and removexattr, + * butset and remove xattr fops do not come with keys AFR_XATTR_PREFIX + */ + if (!strncmp(xattr, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)) && + (mem_0filled(v->data, v->len) == 0)) { + dict_del(xattrs, xattr); + } + return; +} + +static gf_boolean_t +up_key_is_regd_xattr(dict_t *regd_xattrs, char *regd_xattr, data_t *v, + void *xattr) +{ + int ret = _gf_false; + char *key = xattr; + + if (fnmatch(regd_xattr, key, 0) == 0) + ret = _gf_true; + + return ret; +} + +int +up_filter_unregd_xattr(dict_t *xattrs, char *xattr, data_t *v, + void *regd_xattrs) +{ + int ret = 0; + + ret = dict_foreach_match(regd_xattrs, up_key_is_regd_xattr, xattr, + dict_null_foreach_fn, NULL); + if (ret == 0) { + /* xattr was not found in the registered xattr, hence do not + * send notification for its change + */ + dict_del(xattrs, xattr); + goto out; + } + up_filter_afr_xattr(xattrs, xattr, v); +out: + return 0; +} + +int +up_filter_xattr(dict_t *xattr, dict_t *regd_xattrs) +{ + int ret = 0; + + ret = dict_foreach(xattr, up_filter_unregd_xattr, regd_xattrs); + + return ret; +} + +static void +upcall_client_cache_invalidate(xlator_t *this, uuid_t gfid, + upcall_client_t *up_client_entry, uint32_t flags, + struct iatt *stbuf, struct iatt *p_stbuf, + struct iatt *oldp_stbuf, dict_t 
*xattr, + time_t now); + +gf_boolean_t +up_invalidate_needed(dict_t *xattrs) +{ + if (dict_key_count(xattrs) == 0) { + gf_msg_trace("upcall", 0, + "None of xattrs requested for" + " invalidation, were changed. Nothing to " + "invalidate"); + return _gf_false; + } + + return _gf_true; +} + +/* + * Given a client, first fetch upcall_entry_t from the inode_ctx client list. + * Later traverse through the client list of that upcall entry. If this client + * is not present in the list, create one client entry with this client info. + * Also check if there are other clients which need to be notified of this + * op. If yes send notify calls to them. + * + * Since sending notifications for cache_invalidation is a best effort, + * any errors during the process are logged and ignored. + */ +void +upcall_cache_invalidate(call_frame_t *frame, xlator_t *this, client_t *client, + inode_t *inode, uint32_t flags, struct iatt *stbuf, + struct iatt *p_stbuf, struct iatt *oldp_stbuf, + dict_t *xattr) +{ + upcall_client_t *up_client_entry = NULL; + upcall_client_t *tmp = NULL; + upcall_inode_ctx_t *up_inode_ctx = NULL; + gf_boolean_t found = _gf_false; + time_t time_now; + inode_t *linked_inode = NULL; + + if (!is_upcall_enabled(this)) + return; + + /* server-side generated fops like quota/marker will not have any + * client associated with them. Ignore such fops. + */ + if (!client) { + gf_msg_debug("upcall", 0, "Internal fop - client NULL"); + return; + } + + /* For nameless LOOKUPs, inode created shall always be + * invalid. Hence check if there is any already linked inode. 
+ * If yes, update the inode_ctx of that valid inode + */ + if (inode && (inode->ia_type == IA_INVAL) && stbuf) { + linked_inode = inode_find(inode->table, stbuf->ia_gfid); + if (linked_inode) { + gf_log("upcall", GF_LOG_DEBUG, + "upcall_inode_ctx_get of linked inode (%p)", inode); + up_inode_ctx = upcall_inode_ctx_get(linked_inode, this); + } + } + + if (inode && !up_inode_ctx) + up_inode_ctx = upcall_inode_ctx_get(inode, this); + + if (!up_inode_ctx) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR, + "upcall_inode_ctx_get failed (%p)", inode); + return; + } + + /* In case of LOOKUP, if first time, inode created shall be + * invalid till it gets linked to inode table. Read gfid from + * the stat returned in such cases. + */ + if (gf_uuid_is_null(up_inode_ctx->gfid) && stbuf) { + /* That means inode must have been invalid when this inode_ctx + * is created. Copy the gfid value from stbuf instead. + */ + gf_uuid_copy(up_inode_ctx->gfid, stbuf->ia_gfid); + } + + if (gf_uuid_is_null(up_inode_ctx->gfid)) { + gf_msg_debug(this->name, 0, + "up_inode_ctx->gfid and " + "stbuf->ia_gfid is NULL, fop:%s", + gf_fop_list[frame->root->op]); + goto out; + } + + time_now = gf_time(); + pthread_mutex_lock(&up_inode_ctx->client_list_lock); + { + list_for_each_entry_safe(up_client_entry, tmp, + &up_inode_ctx->client_list, client_list) + { + /* Do not send UPCALL event if same client. */ + if (!strcmp(client->client_uid, up_client_entry->client_uid)) { + up_client_entry->access_time = time_now; + found = _gf_true; + continue; + } + + /* + * Ignore sending notifications in case of only UP_ATIME + */ + if (!(flags & ~(UP_ATIME))) { + if (found) + break; + else /* we still need to find current client entry*/ + continue; + } + + /* any other client */ + + /* XXX: Send notifications asynchrounously + * instead of in the I/O path - BZ 1200264 + * Also if the file is frequently accessed, set + * expire_time_attr to 0. 
+ */ + upcall_client_cache_invalidate( + this, up_inode_ctx->gfid, up_client_entry, flags, stbuf, + p_stbuf, oldp_stbuf, xattr, time_now); + } + + if (!found) { + up_client_entry = __add_upcall_client(frame, client, up_inode_ctx, + time_now); + } + } + pthread_mutex_unlock(&up_inode_ctx->client_list_lock); +out: + /* release the ref from inode_find */ + if (linked_inode) + inode_unref(linked_inode); + return; +} + +/* + * If the upcall_client_t has recently accessed the file (i.e, within + * priv->cache_invalidation_timeout), send a upcall notification. + */ +static void +upcall_client_cache_invalidate(xlator_t *this, uuid_t gfid, + upcall_client_t *up_client_entry, uint32_t flags, + struct iatt *stbuf, struct iatt *p_stbuf, + struct iatt *oldp_stbuf, dict_t *xattr, + time_t now) +{ + struct gf_upcall up_req = { + 0, + }; + struct gf_upcall_cache_invalidation ca_req = { + 0, + }; + time_t timeout = 0; + int ret = -1; + time_t t_expired = now - up_client_entry->access_time; + + GF_VALIDATE_OR_GOTO("upcall_client_cache_invalidate", + !(gf_uuid_is_null(gfid)), out); + timeout = get_cache_invalidation_timeout(this); + + if (t_expired < timeout) { + /* Send notify call */ + up_req.client_uid = up_client_entry->client_uid; + gf_uuid_copy(up_req.gfid, gfid); + + ca_req.flags = flags; + ca_req.expire_time_attr = up_client_entry->expire_time_attr; + if (stbuf) + ca_req.stat = *stbuf; + if (p_stbuf) + ca_req.p_stat = *p_stbuf; + if (oldp_stbuf) + ca_req.oldp_stat = *oldp_stbuf; + ca_req.dict = xattr; + + up_req.data = &ca_req; + up_req.event_type = GF_UPCALL_CACHE_INVALIDATION; + + gf_log(THIS->name, GF_LOG_TRACE, + "Cache invalidation notification sent to %s", + up_client_entry->client_uid); + + /* Need to send inode flags */ + ret = this->notify(this, GF_EVENT_UPCALL, &up_req); + + /* + * notify may fail as the client could have been + * dis(re)connected. Cleanup the client entry. 
+ */ + if (ret < 0) + __upcall_cleanup_client_entry(up_client_entry); + + } else { + gf_log(THIS->name, GF_LOG_TRACE, + "Cache invalidation notification NOT sent to %s", + up_client_entry->client_uid); + + if (t_expired > (2 * timeout)) { + /* Cleanup the entry */ + __upcall_cleanup_client_entry(up_client_entry); + } + } +out: + return; +} + +/* + * This is called during upcall_inode_ctx cleanup in case of 'inode_forget'. + * Send "UP_FORGET" to all the clients so that they invalidate their cache + * entry and do a fresh lookup next time when any I/O comes in. + */ +static void +upcall_cache_forget(xlator_t *this, inode_t *inode, + upcall_inode_ctx_t *up_inode_ctx) +{ + upcall_client_t *up_client_entry = NULL; + upcall_client_t *tmp = NULL; + uint32_t flags = UP_FORGET; + time_t time_now; + + if (!up_inode_ctx) { + return; + } + + time_now = gf_time(); + pthread_mutex_lock(&up_inode_ctx->client_list_lock); + { + list_for_each_entry_safe(up_client_entry, tmp, + &up_inode_ctx->client_list, client_list) + { + /* Set the access time to gf_time() + * to send notify */ + up_client_entry->access_time = time_now; + + upcall_client_cache_invalidate(this, up_inode_ctx->gfid, + up_client_entry, flags, NULL, NULL, + NULL, NULL, time_now); + } + } + pthread_mutex_unlock(&up_inode_ctx->client_list_lock); +} diff --git a/xlators/features/upcall/src/upcall-mem-types.h b/xlators/features/upcall/src/upcall-mem-types.h new file mode 100644 index 00000000000..f9883d9d72c --- /dev/null +++ b/xlators/features/upcall/src/upcall-mem-types.h @@ -0,0 +1,23 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __UPCALL_MEM_TYPES_H__ +#define __UPCALL_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_upcall_mem_types_ { + gf_upcall_mt_conf_t = gf_common_mt_end + 1, + gf_upcall_mt_private_t, + gf_upcall_mt_upcall_inode_ctx_t, + gf_upcall_mt_upcall_client_entry_t, + gf_upcall_mt_end +}; +#endif diff --git a/xlators/features/upcall/src/upcall-messages.h b/xlators/features/upcall/src/upcall-messages.h new file mode 100644 index 00000000000..4095a34c200 --- /dev/null +++ b/xlators/features/upcall/src/upcall-messages.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _UPCALL_MESSAGES_H_ +#define _UPCALL_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(UPCALL, UPCALL_MSG_NO_MEMORY, UPCALL_MSG_INTERNAL_ERROR, + UPCALL_MSG_NOTIFY_FAILED); + +#endif /* !_UPCALL_MESSAGES_H_ */ diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c new file mode 100644 index 00000000000..0795f58059d --- /dev/null +++ b/xlators/features/upcall/src/upcall.c @@ -0,0 +1,2505 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> +#include <fcntl.h> +#include <limits.h> +#include <pthread.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/compat.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> + +#include <glusterfs/statedump.h> + +#include "upcall.h" +#include "upcall-mem-types.h" +#include "glusterfs3-xdr.h" +#include "protocol-common.h" +#include <glusterfs/defaults.h> + +static int32_t +up_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + client_t *client = NULL; + uint32_t flags = 0; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + client = frame->root->client; + local = frame->local; + + if ((op_ret < 0) || !local) { + goto out; + } + flags = UP_UPDATE_CLIENT; + upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL, + NULL, NULL, NULL); + +out: + UPCALL_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + + return 0; +} + +static int32_t +up_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { + goto err; + } + +out: + STACK_WIND(frame, up_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; + +err: + UPCALL_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int32_t +up_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ 
+ client_t *client = NULL; + uint32_t flags = 0; + upcall_local_t *local = NULL; + + client = frame->root->client; + local = frame->local; + + if ((op_ret < 0) || !local) { + goto out; + } + flags = UP_WRITE_FLAGS; + upcall_cache_invalidate(frame, this, client, local->inode, flags, postbuf, + NULL, NULL, NULL); + +out: + UPCALL_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +static int32_t +up_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { + goto err; + } + +out: + STACK_WIND(frame, up_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags, + iobref, xdata); + + return 0; + +err: + UPCALL_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +static int32_t +up_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) +{ + client_t *client = NULL; + uint32_t flags = 0; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + client = frame->root->client; + local = frame->local; + + if ((op_ret < 0) || !local) { + goto out; + } + flags = UP_UPDATE_CLIENT; + upcall_cache_invalidate(frame, this, client, local->inode, flags, stbuf, + NULL, NULL, NULL); + +out: + UPCALL_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +static int32_t +up_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = 
upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { + goto err; + } + +out: + STACK_WIND(frame, up_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + + return 0; + +err: + UPCALL_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + + return 0; +} + +static int32_t +up_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + client_t *client = NULL; + uint32_t flags = 0; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + client = frame->root->client; + local = frame->local; + + if ((op_ret < 0) || !local) { + goto out; + } + flags = UP_UPDATE_CLIENT; + upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL, + NULL, NULL, NULL); + +out: + UPCALL_STACK_UNWIND(lk, frame, op_ret, op_errno, lock, xdata); + + return 0; +} + +static int32_t +up_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { + goto err; + } + +out: + STACK_WIND(frame, up_lk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, + fd, cmd, flock, xdata); + return 0; + +err: + UPCALL_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int32_t +up_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + client_t *client = NULL; + uint32_t flags = 0; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + client = frame->root->client; + local = frame->local; + + if ((op_ret < 0) || !local) { + goto out; + } + flags = UP_WRITE_FLAGS; + upcall_cache_invalidate(frame, this, client, local->inode, flags, postbuf, + NULL, NULL, NULL); + 
+out:
+    UPCALL_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    return 0;
+}
+/* truncate fop. EXIT_IF_UPCALL_OFF() is a macro defined elsewhere; it is
+ * handed the 'out' label, so presumably it jumps there (plain pass-through
+ * wind) when upcall is disabled -- confirm against its definition.
+ * Otherwise a frame local is set up for loc->inode; if that fails the fop is
+ * unwound with -1/ENOMEM. The call is then wound to the first child with
+ * up_truncate_cbk as the callback. */
+static int32_t
+up_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback shared by up_setattr() and up_fsetattr(). On success (op_ret >= 0)
+ * with a frame local present, calls upcall_cache_invalidate() on local->inode
+ * with UP_ATTR_FLAGS, adding UP_XATTR when is_same_mode() returns non-zero for
+ * the pre/post protection bits. Always unwinds the received result as
+ * 'setattr'. */
+static int32_t
+up_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct iatt *statpre, struct iatt *statpost,
+               dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    /* XXX: setattr -> UP_SIZE or UP_OWN or UP_MODE or UP_TIMES
+     * or INODE_UPDATE (or UP_PERM esp in case of ACLs -> INODE_INVALIDATE)
+     * Need to check what attr is changed and accordingly pass UP_FLAGS.
+     * Bug1200271.
+     */
+    flags = UP_ATTR_FLAGS;
+    /* If mode bits have changed invalidate the xattrs, as posix-acl and
+     * others store permission related information in xattrs. With changing
+     * of permissions/mode, we need to make clients to forget all the
+     * xattrs related to permissions.
+     * TODO: Invalidate the xattr system.posix_acl_access alone.
+     */
+    if (is_same_mode(statpre->ia_prot, statpost->ia_prot) != 0)
+        flags |= UP_XATTR;
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, statpost,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+
+    return 0;
+}
+/* setattr fop: sets up the frame local for loc->inode (unless
+ * EXIT_IF_UPCALL_OFF() branches to 'out') and winds to the first child with
+ * up_setattr_cbk. Unwinds -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for rename. On success with a frame local present it issues up to
+ * three upcall_cache_invalidate() calls: one on local->inode with
+ * UP_RENAME_FLAGS | UP_PARENT_DENTRY_FLAGS, one UP_UPDATE_CLIENT on
+ * local->rename_oldloc.parent, and -- only when the old and new parent
+ * pointers differ -- one UP_UPDATE_CLIENT on local->loc.parent. The rename
+ * result is then unwound unchanged. */
+static int32_t
+up_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent,
+              struct iatt *postoldparent, struct iatt *prenewparent,
+              struct iatt *postnewparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = (UP_RENAME_FLAGS | UP_PARENT_DENTRY_FLAGS);
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, stbuf,
+                            postnewparent, postoldparent, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->rename_oldloc.parent,
+                            flags, postoldparent, NULL, NULL, NULL);
+
+    if (local->rename_oldloc.parent == local->loc.parent)
+        goto out;
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.parent, flags,
+                            postnewparent, NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(rename, frame, op_ret, op_errno, stbuf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata);
+
+    return 0;
+}
+/* rename fop: sets up the frame local from newloc and oldloc->inode, copies
+ * oldloc into local->rename_oldloc for use by the callback, then winds to the
+ * first child with up_rename_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+    /* copy oldloc */
+    loc_copy(&local->rename_oldloc, oldloc);
+out:
+    STACK_WIND(frame, up_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL, NULL);
+
+    return 0;
+}
+/* Callback for unlink. On success with a frame local present, invalidates
+ * local->inode with UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS and then calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT on local->loc.parent.
+ * Unwinds the unlink result unchanged. */
+static int32_t
+up_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = (UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS);
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            postparent, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.parent, flags,
+                            postparent, NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    return 0;
+}
+/* unlink fop: sets up the frame local from loc and loc->inode, then winds to
+ * the first child with up_unlink_cbk. Unwinds -1/ENOMEM if the local cannot
+ * be set up. */
+static int32_t
+up_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+          dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for link. On success with a frame local present, invalidates
+ * local->inode with UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS and then calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT on local->loc.parent.
+ * Unwinds the link result unchanged. */
+static int32_t
+up_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+            int op_errno, inode_t *inode, struct iatt *stbuf,
+            struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = (UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS);
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, stbuf,
+                            postparent, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.parent, flags,
+                            postparent, NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+/* link fop: sets up the frame local from newloc and oldloc->inode, then winds
+ * to the first child with up_link_cbk. Unwinds -1/ENOMEM if the local cannot
+ * be set up. */
+static int32_t
+up_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+/* Callback for rmdir. On success with a frame local present, invalidates
+ * local->inode with UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS and then calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT on local->loc.parent.
+ * Unwinds the rmdir result unchanged. */
+static int32_t
+up_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *preparent, struct iatt *postparent,
+             dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    flags = (UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS);
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            postparent, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.parent, flags,
+                            postparent, NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    return 0;
+}
+/* rmdir fop: sets up the frame local from loc and loc->inode, then winds to
+ * the first child with up_rmdir_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+         dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for mkdir. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_TIMES and postparent on local->inode
+ * (up_mkdir() passes loc->parent to upcall_local_init(), so this is
+ * presumably the parent directory -- confirm), then with UP_UPDATE_CLIENT and
+ * stbuf on local->loc.inode. Unwinds the mkdir result unchanged. */
+static int32_t
+up_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, inode_t *inode, struct iatt *stbuf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    /* invalidate parent's entry too */
+    flags = UP_TIMES;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags,
+                            postparent, NULL, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.inode, flags, stbuf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+/* mkdir fop: sets up the frame local from loc and loc->parent, then winds to
+ * the first child with up_mkdir_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *params)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, params);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+/* Callback for create. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_TIMES and postparent on local->inode,
+ * then with UP_UPDATE_CLIENT and stbuf on local->loc.inode. Unwinds the
+ * create result unchanged. */
+static int32_t
+up_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    /* As this is a new file create, there is no need to send a
+     * notification. However, invalidate the parent's entry and record the
+     * fact that the client has accessed the newly created entry. */
+    flags = UP_TIMES;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags,
+                            postparent, NULL, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.inode, flags, stbuf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                        preparent, postparent, xdata);
+
+    return 0;
+}
+/* create fop: sets up the frame local from loc and loc->parent, then winds to
+ * the first child with up_create_cbk. Unwinds -1/ENOMEM if the local cannot
+ * be set up. */
+static int32_t
+up_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               params);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL, NULL);
+
+    return 0;
+}
+/* Callback for lookup. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT and stbuf on local->inode.
+ * Unwinds the lookup result unchanged. */
+static int32_t
+up_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+              struct iatt *postparent)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, stbuf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                        postparent);
+
+    return 0;
+}
+/* lookup fop: sets up the frame local for loc->inode, then winds to the first
+ * child with up_lookup_cbk. Unwinds -1/ENOMEM if the local cannot be set
+ * up. */
+static int32_t
+up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback shared by up_stat() and up_fstat(). On success with a frame local
+ * present, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT and buf on
+ * local->inode. Always unwinds the received result as 'stat'. */
+static int32_t
+up_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, buf, NULL,
+                            NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+/* stat fop: sets up the frame local for loc->inode, then winds to the first
+ * child with up_stat_cbk. Unwinds -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* fstat fop: sets up the frame local for fd->inode, then winds to the first
+ * child's fstat, reusing up_stat_cbk as the callback. Unwinds -1/ENOMEM if
+ * the local cannot be set up. */
+static int32_t
+up_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* ftruncate fop: sets up the frame local for fd->inode, then winds to the
+ * first child's ftruncate, reusing up_truncate_cbk as the callback. Unwinds
+ * -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for access. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the access result unchanged. */
+static int32_t
+up_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+/* access fop: sets up the frame local for loc->inode, then winds to the first
+ * child with up_access_cbk. Unwinds -1/ENOMEM if the local cannot be set
+ * up. */
+static int32_t
+up_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* Callback for readlink. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT and stbuf on local->inode.
+ * Unwinds the readlink result unchanged. */
+static int32_t
+up_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, const char *path, struct iatt *stbuf,
+                dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, stbuf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata);
+
+    return 0;
+}
+/* readlink fop: sets up the frame local for loc->inode, then winds to the
+ * first child with up_readlink_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(readlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for mknod. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_TIMES and postparent on local->inode,
+ * then with UP_UPDATE_CLIENT and buf on local->loc.inode. Unwinds the mknod
+ * result unchanged. */
+static int32_t
+up_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    /* invalidate parent's entry too */
+    flags = UP_TIMES;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags,
+                            postparent, NULL, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.inode, flags, buf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+/* mknod fop: sets up the frame local from loc and loc->parent, then winds to
+ * the first child with up_mknod_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+/* Callback for symlink. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_TIMES and postparent on local->inode,
+ * then with UP_UPDATE_CLIENT and buf on local->loc.inode. Unwinds the symlink
+ * result unchanged. */
+static int32_t
+up_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    /* invalidate parent's entry too */
+    flags = UP_TIMES;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags,
+                            postparent, NULL, NULL, NULL);
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->loc.inode, flags, buf,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+/* symlink fop: sets up the frame local from loc and loc->parent, then winds
+ * to the first child with up_symlink_cbk. Unwinds -1/ENOMEM if the local
+ * cannot be set up. */
+static int32_t
+up_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+/* Callback for opendir. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the opendir result unchanged. */
+static int32_t
+up_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+/* opendir fop: sets up the frame local for loc->inode, then winds to the
+ * first child with up_opendir_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* Callback for statfs. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the statfs result unchanged. */
+static int32_t
+up_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+/* statfs fop: sets up the frame local for loc->inode, then winds to the first
+ * child with up_statfs_cbk. Unwinds -1/ENOMEM if the local cannot be set
+ * up. */
+static int32_t
+up_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* Callback for readdir. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the readdir result unchanged. */
+static int32_t
+up_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+               dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
+}
+/* readdir fop: sets up the frame local for fd->inode, then winds to the first
+ * child with up_readdir_cbk. Unwinds -1/ENOMEM if the local cannot be set
+ * up. */
+static int32_t
+up_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t off, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* Callback for readdirp. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT on local->inode and then,
+ * with the same flag and the entry's d_stat, on every returned entry whose
+ * inode pointer is non-NULL. Unwinds the readdirp result unchanged. */
+static int32_t
+up_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+    gf_dirent_t *entry = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        if (entry->inode == NULL) {
+            continue;
+        }
+        upcall_cache_invalidate(frame, this, client, entry->inode, flags,
+                                &entry->d_stat, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
+}
+/* readdirp fop: sets up the frame local for fd->inode, then winds to the
+ * first child with up_readdirp_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *dict)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(readdirp, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* fsetattr fop: sets up the frame local for fd->inode, then winds to the
+ * first child's fsetattr, reusing up_setattr_cbk as the callback. Unwinds
+ * -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for fallocate. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_WRITE_FLAGS and the post-op iatt on
+ * local->inode. Unwinds the fallocate result unchanged. */
+static int32_t
+up_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                 struct iatt *post, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_WRITE_FLAGS;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, post,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(fallocate, frame, op_ret, op_errno, pre, post, xdata);
+
+    return 0;
+}
+/* fallocate fop: sets up the frame local for fd->inode, then winds to the
+ * first child with up_fallocate_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for discard. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_WRITE_FLAGS and the post-op iatt on
+ * local->inode. Unwinds the discard result unchanged. */
+static int32_t
+up_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *pre,
+               struct iatt *post, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_WRITE_FLAGS;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, post,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(discard, frame, op_ret, op_errno, pre, post, xdata);
+
+    return 0;
+}
+/* discard fop: sets up the frame local for fd->inode, then winds to the first
+ * child with up_discard_cbk. Unwinds -1/ENOMEM if the local cannot be set
+ * up. */
+static int32_t
+up_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for zerofill. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_WRITE_FLAGS and the post-op iatt on
+ * local->inode. Unwinds the zerofill result unchanged. */
+static int32_t
+up_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                struct iatt *post, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_WRITE_FLAGS;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, post,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(zerofill, frame, op_ret, op_errno, pre, post, xdata);
+
+    return 0;
+}
+/* zerofill fop: sets up the frame local for fd->inode, then winds to the
+ * first child with up_zerofill_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. (Declared 'static int', unlike its int32_t siblings.) */
+static int
+up_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback for seek. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the seek result unchanged. */
+static int32_t
+up_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+            int op_errno, off_t offset, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+
+    return 0;
+}
+/* seek fop: sets up the frame local for fd->inode, then winds to the first
+ * child with up_seek_cbk. Unwinds -1/ENOMEM (offset 0) if the local cannot be
+ * set up. */
+static int32_t
+up_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_seek_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL);
+
+    return 0;
+}
+/* Callback for setxattr. On success with a frame local present:
+ *  - runs up_filter_xattr(local->xattr, priv->xattrs); a negative return
+ *    replaces op_ret and skips the invalidation;
+ *  - skips the invalidation when up_invalidate_needed(local->xattr) is false;
+ *  - otherwise invalidates local->inode with UP_XATTR, adding UP_TIMES when
+ *    GF_POSTSTAT could be read from xdata into stbuf, and passes local->xattr.
+ * NOTE(review): op_errno is left untouched when op_ret is overwritten with
+ * the filter's error -- confirm this is intended. */
+static int32_t
+up_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+    int ret = 0;
+    struct iatt stbuf = {
+        0,
+    };
+    upcall_private_t *priv = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    flags = UP_XATTR;
+
+    ret = up_filter_xattr(local->xattr, priv->xattrs);
+    if (ret < 0) {
+        op_ret = ret;
+        goto out;
+    }
+    if (!up_invalidate_needed(local->xattr))
+        goto out;
+
+    ret = dict_get_iatt(xdata, GF_POSTSTAT, &stbuf);
+    if (ret == 0)
+        flags |= UP_TIMES;
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, &stbuf,
+                            NULL, NULL, local->xattr);
+
+out:
+    UPCALL_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+/* setxattr fop: sets up the frame local from loc, loc->inode and the xattr
+ * dict being set, then winds to the first child with up_setxattr_cbk.
+ * Unwinds -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, dict);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* Callback for fsetxattr. Same sequence as the setxattr callback: filter
+ * local->xattr against priv->xattrs (a negative result replaces op_ret),
+ * skip if up_invalidate_needed() is false, otherwise invalidate local->inode
+ * with UP_XATTR (plus UP_TIMES when GF_POSTSTAT is readable from xdata).
+ * NOTE(review): op_errno is left untouched when op_ret is overwritten --
+ * confirm this is intended. */
+static int32_t
+up_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+    int ret = 0;
+    struct iatt stbuf = {
+        0,
+    };
+    upcall_private_t *priv = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    flags = UP_XATTR;
+
+    ret = up_filter_xattr(local->xattr, priv->xattrs);
+    if (ret < 0) {
+        op_ret = ret;
+        goto out;
+    }
+    if (!up_invalidate_needed(local->xattr))
+        goto out;
+
+    ret = dict_get_iatt(xdata, GF_POSTSTAT, &stbuf);
+    if (ret == 0)
+        flags |= UP_TIMES;
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, &stbuf,
+                            NULL, NULL, local->xattr);
+
+out:
+    UPCALL_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+/* fsetxattr fop: sets up the frame local from fd, fd->inode and the xattr
+ * dict being set, then winds to the first child with up_fsetxattr_cbk.
+ * Unwinds -1/ENOMEM if the local cannot be set up. */
+static int32_t
+up_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, fd, fd->inode, dict);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+
+    return 0;
+
+err:
+    UPCALL_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* Callback for fremovexattr. Same sequence as the setxattr callbacks, but the
+ * base flag is UP_XATTR_RM: filter local->xattr against priv->xattrs (a
+ * negative result replaces op_ret), skip if up_invalidate_needed() is false,
+ * otherwise invalidate local->inode (adding UP_TIMES when GF_POSTSTAT is
+ * readable from xdata).
+ * NOTE(review): op_errno is left untouched when op_ret is overwritten --
+ * confirm this is intended. */
+static int32_t
+up_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int ret = 0;
+    upcall_private_t *priv = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_XATTR_RM;
+
+    ret = up_filter_xattr(local->xattr, priv->xattrs);
+    if (ret < 0) {
+        op_ret = ret;
+        goto out;
+    }
+    if (!up_invalidate_needed(local->xattr))
+        goto out;
+
+    ret = dict_get_iatt(xdata, GF_POSTSTAT, &stbuf);
+    if (ret == 0)
+        flags |= UP_TIMES;
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, &stbuf,
+                            NULL, NULL, local->xattr);
+
+out:
+    UPCALL_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+/* fremovexattr fop: builds a one-entry dict from 'name' via
+ * dict_for_key_value(), hands it to upcall_local_init() together with fd and
+ * fd->inode, drops this function's own reference on the dict, and winds to
+ * the first child with up_fremovexattr_cbk. If the dict or the local cannot
+ * be created, the dict (if any) is unref'd and the fop is unwound with
+ * -1/ENOMEM. */
+static int32_t
+up_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+    dict_t *xattr = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    xattr = dict_for_key_value(name, "", 1, _gf_true);
+    if (!xattr) {
+        goto err;
+    }
+
+    local = upcall_local_init(frame, this, NULL, fd, fd->inode, xattr);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    STACK_WIND(frame, up_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+
+err:
+    if (xattr)
+        dict_unref(xattr);
+
+    UPCALL_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* Callback for removexattr. Same sequence as the fremovexattr callback:
+ * filter local->xattr against priv->xattrs (a negative result replaces
+ * op_ret), skip if up_invalidate_needed() is false, otherwise invalidate
+ * local->inode with UP_XATTR_RM (plus UP_TIMES when GF_POSTSTAT is readable
+ * from xdata).
+ * NOTE(review): op_errno is left untouched when op_ret is overwritten --
+ * confirm this is intended. */
+static int32_t
+up_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int ret = 0;
+    upcall_private_t *priv = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+    flags = UP_XATTR_RM;
+
+    ret = up_filter_xattr(local->xattr, priv->xattrs);
+    if (ret < 0) {
+        op_ret = ret;
+        goto out;
+    }
+    if (!up_invalidate_needed(local->xattr))
+        goto out;
+
+    ret = dict_get_iatt(xdata, GF_POSTSTAT, &stbuf);
+    if (ret == 0)
+        flags |= UP_TIMES;
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, &stbuf,
+                            NULL, NULL, local->xattr);
+
+out:
+    UPCALL_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+/* removexattr fop: builds a one-entry dict from 'name' via
+ * dict_for_key_value(), hands it to upcall_local_init() together with loc and
+ * loc->inode, drops this function's own reference on the dict, and winds to
+ * the first child with up_removexattr_cbk. If the dict or the local cannot be
+ * created, the dict (if any) is unref'd and the fop is unwound with
+ * -1/ENOMEM. */
+static int32_t
+up_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+    dict_t *xattr = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    xattr = dict_for_key_value(name, "", 1, _gf_true);
+    if (!xattr) {
+        goto err;
+    }
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, xattr);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    STACK_WIND(frame, up_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+
+err:
+    if (xattr)
+        dict_unref(xattr);
+
+    UPCALL_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* Callback for fgetxattr. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the fgetxattr result unchanged. */
+static int32_t
+up_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+/* fgetxattr fop: sets up the frame local for fd->inode, then winds to the
+ * first child with up_fgetxattr_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+err:
+    UPCALL_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
+/* Callback for getxattr. On success with a frame local present, calls
+ * upcall_cache_invalidate() with UP_UPDATE_CLIENT (no stat buffer) on
+ * local->inode. Unwinds the getxattr result unchanged. */
+static int32_t
+up_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    client_t *client = NULL;
+    uint32_t flags = 0;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    flags = UP_UPDATE_CLIENT;
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, NULL,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+/* getxattr fop: sets up the frame local for loc->inode, then winds to the
+ * first child with up_getxattr_cbk. Unwinds -1/ENOMEM if the local cannot be
+ * set up. */
+static int32_t
+up_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    int32_t op_errno = ENOMEM;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (!local) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+err:
+    UPCALL_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
+
+/* The xattrops here mainly track changes in the afr pending xattrs.
+ * 1. xattrop doesn't carry info saying whether it is a post op or a pre op.
+ * 2. Pre xattrop will have 0 value for all pending xattrs,
+ *    the cbk of pre xattrop carries the on-disk xattr value.
+ *    Non zero on-disk xattr indicates pending healing.
+ * 3. Post xattrop will either have 0 or 1 as value of pending xattrs,
+ *    0 on success, 1 on failure. But the post xattrop cbk will have
+ *    0 or 1 or any higher value.
+ *    0 - if no healing is required
+ *    1 - if this is the first time a pending xattr is being set.
+ *    n - if there is already a pending xattr set, it will increment
+ *        the on-disk value and send that in cbk.
+ * Our aim is to send an invalidation only the first time a pending
+ * xattr is set on a file. Below are some of the exceptions in handling
+ * xattrop:
+ * - Do not filter unregistered xattrs in the cbk, but in the call path.
+ *   Else, we will be invalidating on every preop, if the file already has
+ *   pending xattr set. Filtering unregistered xattrs on the fop path
+ *   ensures we invalidate only in postop, every time a postop comes with
+ *   pending xattr value 1.
+ * - Consider a brick is down, and the postop sets pending xattrs as long
+ *   as the other brick is down. But we do not want to invalidate every time
+ *   a pending xattr is set, but we want to invalidate only the first time
+ *   a pending xattr is set on any file. Hence, to identify if it is the first
+ *   time a pending xattr is set, we compare the value of pending xattrs that
+ *   came in postop and postop cbk; if they are the same then it is the
+ *   first time.
+ *
+ * In the code below: when up_invalidate_needed(local->xattr) is true, each
+ * key of local->xattr is passed through up_compare_afr_xattr() against the
+ * returned dict; a negative dict_foreach() result skips the invalidation,
+ * otherwise local->inode is invalidated with UP_XATTR. The result is unwound
+ * as fxattrop or xattrop depending on frame->root->op.
+ */
+static int32_t
+up_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    client_t *client = NULL;
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    client = frame->root->client;
+    local = frame->local;
+
+    if ((op_ret < 0) || !local) {
+        goto out;
+    }
+
+    if (up_invalidate_needed(local->xattr)) {
+        if (dict_foreach(local->xattr, up_compare_afr_xattr, dict) < 0)
+            goto out;
+
+        upcall_cache_invalidate(frame, this, client, local->inode, UP_XATTR,
+                                NULL, NULL, NULL, local->xattr);
+    }
+out:
+    if (frame->root->op == GF_FOP_FXATTROP) {
+        UPCALL_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    } else {
+        UPCALL_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    }
+    return 0;
+}
+
+static int32_t
+up_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    int32_t op_errno = EINVAL;
+    upcall_local_t *local = NULL;
+    int ret = 0;
+    upcall_private_t *priv = NULL;
+
+
EXIT_IF_UPCALL_OFF(this, out); + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->inode, xattr); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + ret = up_filter_xattr(local->xattr, priv->xattrs); + if (ret < 0) { + goto err; + } + +out: + STACK_WIND(frame, up_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata); + return 0; +err: + UPCALL_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} + +/* fd-based counterpart of up_xattrop(): stash the inode, fd and a copy of + * the xattr dict in frame->local, filter that copy against the xattrs + * registered in priv->xattrs, then wind fxattrop to the first child. + * When upcall is off or priv is missing the fop is wound without a local. + * On failure the fop is unwound with -1 and ENOMEM (no local) or EINVAL + * (filter failed). */ +static int32_t +up_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + upcall_local_t *local = NULL; + int ret = 0; + upcall_private_t *priv = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + local = upcall_local_init(frame, this, NULL, fd, fd->inode, xattr); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + ret = up_filter_xattr(local->xattr, priv->xattrs); + if (ret < 0) { + goto err; + } + +out: + STACK_WIND(frame, up_xattrop_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata); + return 0; +err: + /* UPCALL_STACK_UNWIND detaches frame->local and hands it to + * upcall_local_wipe(), releasing the inode/fd/dict references taken by + * upcall_local_init(); the plain STACK_UNWIND_STRICT used here before + * never called upcall_local_wipe(). Same macro as up_xattrop(). */ + UPCALL_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_upcall_mt_end + 1); + + if (ret != 0) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY, + "Memory allocation failed"); + return ret; + } + + return ret; +} + +void +upcall_local_wipe(xlator_t *this, upcall_local_t *local) +{ + if (local) { + inode_unref(local->inode); + if (local->xattr) + dict_unref(local->xattr); + loc_wipe(&local->rename_oldloc); + loc_wipe(&local->loc); + if (local->fd) + fd_unref(local->fd); + mem_put(local); + } +} + +upcall_local_t * +upcall_local_init(call_frame_t *frame, xlator_t *this, 
loc_t *loc, fd_t *fd, + inode_t *inode, dict_t *xattr) +{ + upcall_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("upcall", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + local = mem_get0(THIS->local_pool); + + if (!local) + goto out; + + local->inode = inode_ref(inode); + if (xattr) + local->xattr = dict_copy_with_ref(xattr, NULL); + + if (loc) + loc_copy(&local->loc, loc); + if (fd) + local->fd = fd_ref(fd); + + frame->local = local; + +out: + return local; +} + +static int32_t +update_xattrs(dict_t *dict, char *key, data_t *value, void *data) +{ + dict_t *xattrs = data; + int ret = 0; + + ret = dict_set_int8(xattrs, key, 0); + return ret; +} + +int32_t +up_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + upcall_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + if (op != GF_IPC_TARGET_UPCALL) + goto wind; + + /* TODO: Bz-1371622 Along with the xattrs also store list of clients + * that are interested in notifications, so that the notification + * can be sent to the clients that have registered. + * Once this implemented there can be unregister of xattrs for + * notifications. 
Until then there is no unregister of xattrs*/ + if (xdata && priv->xattrs) { + ret = dict_foreach(xdata, update_xattrs, priv->xattrs); + } + +out: + STACK_UNWIND_STRICT(ipc, frame, ret, 0, NULL); + return 0; + +wind: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + upcall_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, priv, out); + + GF_OPTION_RECONF("cache-invalidation", priv->cache_invalidation_enabled, + options, bool, out); + GF_OPTION_RECONF("cache-invalidation-timeout", + priv->cache_invalidation_timeout, options, int32, out); + + ret = 0; + + if (priv->cache_invalidation_enabled && !priv->reaper_init_done) { + ret = upcall_reaper_thread_init(this); + + if (ret) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR, + "reaper_thread creation failed (%s)." + " Disabling cache_invalidation", + strerror(errno)); + } + priv->reaper_init_done = _gf_true; + } + +out: + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + upcall_private_t *priv = NULL; + + priv = GF_CALLOC(1, sizeof(*priv), gf_upcall_mt_private_t); + if (!priv) + goto out; + + priv->xattrs = dict_new(); + if (!priv->xattrs) + goto out; + + GF_OPTION_INIT("cache-invalidation", priv->cache_invalidation_enabled, bool, + out); + GF_OPTION_INIT("cache-invalidation-timeout", + priv->cache_invalidation_timeout, int32, out); + + LOCK_INIT(&priv->inode_ctx_lk); + INIT_LIST_HEAD(&priv->inode_ctx_list); + + priv->fini = 0; + priv->reaper_init_done = _gf_false; + + this->private = priv; + this->local_pool = mem_pool_new(upcall_local_t, 512); + ret = 0; + + if (priv->cache_invalidation_enabled) { + ret = upcall_reaper_thread_init(this); + + if (ret) { + gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR, + "reaper_thread creation failed (%s)." 
+ " Disabling cache_invalidation", + strerror(errno)); + } + priv->reaper_init_done = _gf_true; + } +out: + if (ret && priv) { + /* priv is freed below: drop every reference 'this' still holds to + * it so a later fini() cannot touch freed memory. this->private is + * only assigned after LOCK_INIT, so it doubles as the "lock was + * initialised" marker. */ + if (this->private == priv) { + LOCK_DESTROY(&priv->inode_ctx_lk); + this->private = NULL; + } + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + if (priv->xattrs) + dict_unref(priv->xattrs); + + GF_FREE(priv); + } + + return ret; +} + +void +fini(xlator_t *this) +{ + upcall_private_t *priv = NULL; + + priv = this->private; + if (!priv) { + return; + } + this->private = NULL; + + priv->fini = 1; + + if (priv->reaper_thr) { + gf_thread_cleanup_xint(priv->reaper_thr); + priv->reaper_thr = 0; + priv->reaper_init_done = _gf_false; + } + + dict_unref(priv->xattrs); + LOCK_DESTROY(&priv->inode_ctx_lk); + + /* Do we need to cleanup the inode_ctxs? IMO not required + * as inode_forget would have been done on all the inodes + * before calling xlator_fini */ + GF_FREE(priv); + + if (this->local_pool) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + return; +} + +int +upcall_forget(xlator_t *this, inode_t *inode) +{ + upcall_private_t *priv = this->private; + + if (!priv) + goto out; + + upcall_cleanup_inode_ctx(this, inode); +out: + return 0; +} + +int +upcall_release(xlator_t *this, fd_t *fd) +{ + return 0; +} + +int +notify(xlator_t *this, int32_t event, void *data, ...) +{ + int ret = -1; + struct gf_upcall *up_req = NULL; + + switch (event) { + case GF_EVENT_UPCALL: { + gf_log(this->name, GF_LOG_DEBUG, "Upcall Notify event = %d", event); + + up_req = (struct gf_upcall *)data; + + GF_VALIDATE_OR_GOTO(this->name, up_req, out); + + ret = default_notify(this, event, up_req); + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, UPCALL_MSG_NOTIFY_FAILED, + "Failed to notify cache invalidation" + " to client(%s)", + up_req->client_uid); + goto out; + } + } break; + default: + default_notify(this, event, data); + break; + } + ret = 0; + +out: + return ret; +} + +struct xlator_fops fops = { + .ipc = up_ipc, + /* fops which change only "ATIME" do not result + * in any cache invalidation. Hence upcall + * notifications are not sent in this case. 
+ * But however, we need to store/update the + * client info in the upcall state to be able + * to notify them in case of any changes done + * to the data. + * + * Below such fops do not trigger upcall + * notifications but will add/update + * clients info in the upcall inode ctx.*/ + .lookup = up_lookup, + .open = up_open, + .statfs = up_statfs, + .opendir = up_opendir, + .readdir = up_readdir, + .readdirp = up_readdirp, + .stat = up_stat, + .fstat = up_fstat, + .access = up_access, + .readlink = up_readlink, + .readv = up_readv, + .lk = up_lk, + .seek = up_seek, + + /* fops doing write */ + .truncate = up_truncate, + .ftruncate = up_ftruncate, + .writev = up_writev, + .zerofill = up_zerofill, + .fallocate = up_fallocate, + .discard = up_discard, + + /* fops changing attributes */ + .fsetattr = up_fsetattr, + .setattr = up_setattr, + + /* fops affecting parent dirent */ + .mknod = up_mknod, + .create = up_create, + .symlink = up_symlink, + .mkdir = up_mkdir, + + /* fops affecting both file and parent + * cache entries */ + .unlink = up_unlink, + .link = up_link, + .rmdir = up_rmdir, + .rename = up_rename, + + .setxattr = up_setxattr, + .fsetxattr = up_fsetxattr, + .getxattr = up_getxattr, + .fgetxattr = up_fgetxattr, + .fremovexattr = up_fremovexattr, + .removexattr = up_removexattr, + .xattrop = up_xattrop, + .fxattrop = up_fxattrop, + +#ifdef NOT_SUPPORTED + /* internal lk fops */ + .inodelk = up_inodelk, + .finodelk = up_finodelk, + .entrylk = up_entrylk, + .fentrylk = up_fentrylk, + + /* Below fops follow 'WRITE' which + * would have already sent upcall + * notifications */ + .flush = up_flush, + .fsync = up_fsync, + .fsyncdir = up_fsyncdir, +#endif +}; + +struct xlator_cbks cbks = { + .forget = upcall_forget, + .release = upcall_release, +}; + +struct volume_options options[] = { + { + .key = {"cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "When \"on\", sends cache-invalidation" + " notifications.", + 
.op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"cache", "cacheconsistency", "upcall"}, + }, + {.key = {"cache-invalidation-timeout"}, + .type = GF_OPTION_TYPE_INT, + .default_value = CACHE_INVALIDATION_TIMEOUT, + .description = "After 'timeout' seconds since the time" + " client accessed any file, cache-invalidation" + " notifications are no longer sent to that client.", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"cache", "cachetimeout", "upcall"}}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "upcall", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/upcall/src/upcall.h b/xlators/features/upcall/src/upcall.h new file mode 100644 index 00000000000..aa535088ad7 --- /dev/null +++ b/xlators/features/upcall/src/upcall.h @@ -0,0 +1,131 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __UPCALL_H__ +#define __UPCALL_H__ + +#include <glusterfs/compat-errno.h> +#include "upcall-mem-types.h" +#include <glusterfs/client_t.h> +#include "upcall-messages.h" +#include "upcall-cache-invalidation.h" +#include <glusterfs/upcall-utils.h> + +#define EXIT_IF_UPCALL_OFF(this, label) \ + do { \ + if (!is_upcall_enabled(this)) \ + goto label; \ + } while (0) + +#define UPCALL_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + upcall_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + upcall_local_wipe(__xl, __local); \ + } while (0) + +#define UPCALL_STACK_DESTROY(frame) \ + do { \ + upcall_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + upcall_local_wipe(__xl, __local); \ + } while (0) + +struct _upcall_private { + gf_boolean_t cache_invalidation_enabled; + int32_t cache_invalidation_timeout; + struct list_head inode_ctx_list; + gf_lock_t inode_ctx_lk; + gf_boolean_t reaper_init_done; + pthread_t reaper_thr; + int32_t fini; + dict_t *xattrs; /* list of xattrs registered by clients + for receiving invalidation */ +}; +typedef struct _upcall_private upcall_private_t; + +struct _upcall_client { + struct list_head client_list; + /* strdup to store client_uid, strdup. 
Free it explicitly */ + char *client_uid; + time_t access_time; /* time last accessed */ + /* the amount of time which client can cache this entry */ + uint32_t expire_time_attr; +}; +typedef struct _upcall_client upcall_client_t; + +/* Upcall entries are maintained in inode_ctx */ +struct _upcall_inode_ctx { + struct list_head inode_ctx_list; + struct list_head client_list; + pthread_mutex_t client_list_lock; /* mutex for clients list + of this upcall entry */ + int destroy; + uuid_t gfid; /* gfid of the entry */ +}; +typedef struct _upcall_inode_ctx upcall_inode_ctx_t; + +struct upcall_local { + /* XXX: need to check if we can store + * pointers in 'local' which may get freed + * in future by other thread + */ + inode_t *inode; + loc_t rename_oldloc; + loc_t loc; /* required for stat in *xattr_cbk */ + fd_t *fd; /* required for fstat in *xattr_cbk */ + dict_t *xattr; +}; +typedef struct upcall_local upcall_local_t; + +void +upcall_local_wipe(xlator_t *this, upcall_local_t *local); +upcall_local_t * +upcall_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + inode_t *inode, dict_t *xattr); + +upcall_inode_ctx_t * +upcall_inode_ctx_get(inode_t *inode, xlator_t *this); +int +upcall_cleanup_inode_ctx(xlator_t *this, inode_t *inode); + +void * +upcall_reaper_thread(void *data); +int +upcall_reaper_thread_init(xlator_t *this); + +/* Xlator options */ +gf_boolean_t +is_upcall_enabled(xlator_t *this); + +/* Cache invalidation specific */ +void +upcall_cache_invalidate(call_frame_t *frame, xlator_t *this, client_t *client, + inode_t *inode, uint32_t flags, struct iatt *stbuf, + struct iatt *p_stbuf, struct iatt *oldp_stbuf, + dict_t *xattr); +int +up_filter_xattr(dict_t *xattr, dict_t *regd_xattrs); + +int +up_compare_afr_xattr(dict_t *d, char *k, data_t *v, void *tmp); + +gf_boolean_t +up_invalidate_needed(dict_t *xattrs); +#endif /* __UPCALL_H__ */ diff --git a/xlators/features/utime/Makefile.am b/xlators/features/utime/Makefile.am new file mode 
100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/features/utime/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/utime/src/Makefile.am b/xlators/features/utime/src/Makefile.am new file mode 100644 index 00000000000..7c3adbc2195 --- /dev/null +++ b/xlators/features/utime/src/Makefile.am @@ -0,0 +1,41 @@ +xlator_LTLIBRARIES = utime.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +UTIME_SRC = $(top_srcdir)/xlators/features/utime/src + +utime_sources = $(UTIME_SRC)/utime-helpers.c +utime_sources += $(UTIME_SRC)/utime.c + +utime_la_SOURCES = $(utime_sources) +nodist_utime_la_SOURCES = utime-autogen-fops.c utime-autogen-fops.h +BUILT_SOURCES = utime-autogen-fops.h + +utime_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +utime_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS_utime = $(UTIME_SRC)/utime-helpers.h +noinst_HEADERS_utime += $(UTIME_SRC)/utime.h +noinst_HEADERS_utime += $(UTIME_SRC)/utime-messages.h +noinst_HEADERS_utime += $(UTIME_SRC)/utime-mem-types.h +noinst_HEADERS = $(top_srcdir)/xlators/lib/src/libxlator.h +noinst_HEADERS += $(noinst_HEADERS_utime) + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/xlators/lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +noinst_PYTHON = utime-gen-fops-c.py utime-gen-fops-h.py +EXTRA_DIST = utime-autogen-fops-tmpl.c utime-autogen-fops-tmpl.h + +utime-autogen-fops.c: utime-gen-fops-c.py utime-autogen-fops-tmpl.c + $(PYTHON) $(UTIME_SRC)/utime-gen-fops-c.py $(UTIME_SRC)/utime-autogen-fops-tmpl.c > $@ + +utime-autogen-fops.h: utime-gen-fops-h.py utime-autogen-fops-tmpl.h + $(PYTHON) $(UTIME_SRC)/utime-gen-fops-h.py $(UTIME_SRC)/utime-autogen-fops-tmpl.h > $@ + +CLEANFILES = $(nodist_utime_la_SOURCES) + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/utime.so diff --git 
a/xlators/features/utime/src/utime-autogen-fops-tmpl.c b/xlators/features/utime/src/utime-autogen-fops-tmpl.c new file mode 100644 index 00000000000..f2f35322926 --- /dev/null +++ b/xlators/features/utime/src/utime-autogen-fops-tmpl.c @@ -0,0 +1,28 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* File: utime-autogen-fops-tmpl.c + * This file contains the utime autogenerated FOPs. This is run through + * the code generator, generator.py to generate the required FOPs. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/logging.h> +#include <glusterfs/statedump.h> +#include "utime-helpers.h" +#include <glusterfs/timespec.h> + +#pragma generate diff --git a/xlators/features/utime/src/utime-autogen-fops-tmpl.h b/xlators/features/utime/src/utime-autogen-fops-tmpl.h new file mode 100644 index 00000000000..4e102ffed6c --- /dev/null +++ b/xlators/features/utime/src/utime-autogen-fops-tmpl.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* File: utime-autogen-fops-tmpl.h + * This file contains the utime autogenerated FOPs declarations. 
+ */ + +#ifndef _UTIME_AUTOGEN_FOPS_H +#define _UTIME_AUTOGEN_FOPS_H + +#include <glusterfs/xlator.h> + +#pragma generate + +#endif /* _UTIME_AUTOGEN_FOPS_H */ diff --git a/xlators/features/utime/src/utime-gen-fops-c.py b/xlators/features/utime/src/utime-gen-fops-c.py new file mode 100755 index 00000000000..9fb3e1b8b1a --- /dev/null +++ b/xlators/features/utime/src/utime-gen-fops-c.py @@ -0,0 +1,147 @@ +#!/usr/bin/python3 + +from __future__ import print_function +import os +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir, '../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, cbk_subs, generate + +FOPS_COMMON_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + (void) utime_update_attribute_flags(frame, this, GF_FOP_@UPNAME@); + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + +FOPS_CBK_COMMON_TEMPLATE = """ +int32_t +gf_utime_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@); + return 0; +} +""" + +FOPS_READ_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + (void) utime_update_attribute_flags(frame, this, GF_FOP_READ); + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + +FOPS_WRITE_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + (void) utime_update_attribute_flags(frame, this, GF_FOP_WRITE); + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + 
+FOPS_COPY_FILE_RANGE_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + (void) utime_update_attribute_flags(frame, this, GF_FOP_COPY_FILE_RANGE); + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + +FOPS_SETATTR_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + gl_timespec_get(&frame->root->ctime); + + if (!valid) { + frame->root->flags |= MDATA_CTIME; + } + + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { + frame->root->flags |= MDATA_CTIME; + } + + if (valid & GF_SET_ATTR_MODE) { + frame->root->flags |= MDATA_CTIME; + } + + if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { + if (valid & GF_ATTR_ATIME_NOW) { + frame->root->ctime.tv_sec = stbuf->ia_atime; + frame->root->ctime.tv_nsec = stbuf->ia_atime_nsec; + } else if (valid & GF_ATTR_MTIME_NOW) { + frame->root->ctime.tv_sec = stbuf->ia_mtime; + frame->root->ctime.tv_nsec = stbuf->ia_mtime_nsec; + } + } + + STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@); + return 0; +} +""" + +utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir', + 'unlink', 'rmdir', 'symlink', 'rename', 'link', 'truncate', + 'ftruncate', 'create', 'open', 'removexattr', 'fremovexattr'] + +utime_read_op = ['readv'] +utime_write_op = ['writev'] +utime_setattr_ops = ['setattr', 'fsetattr'] +utime_copy_file_range_ops = ['copy_file_range'] + +def gen_defaults(): + for name in ops: + if name in utime_ops: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + print(generate(FOPS_COMMON_TEMPLATE, name, fop_subs)) + if name in utime_read_op: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + print(generate(FOPS_READ_TEMPLATE, name, fop_subs)) + if name in utime_write_op: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + 
print(generate(FOPS_WRITE_TEMPLATE, name, fop_subs)) + if name in utime_setattr_ops: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + print(generate(FOPS_SETATTR_TEMPLATE, name, fop_subs)) + if name in utime_copy_file_range_ops: + print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs)) + print(generate(FOPS_COPY_FILE_RANGE_TEMPLATE, name, fop_subs)) + +for l in open(sys.argv[1], 'r').readlines(): + if l.find('#pragma generate') != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_defaults() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git a/xlators/features/utime/src/utime-gen-fops-h.py b/xlators/features/utime/src/utime-gen-fops-h.py new file mode 100755 index 00000000000..e96274c229a --- /dev/null +++ b/xlators/features/utime/src/utime-gen-fops-h.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 + +from __future__ import print_function +import os +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir, '../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, generate + +OP_FOP_TEMPLATE = """ +int32_t +gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@); +""" + +utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir', + 'unlink', 'rmdir', 'symlink', 'rename', 'link', 'truncate', + 'ftruncate', 'create', 'open', 'removexattr', 'fremovexattr', + 'readv', 'writev', 'setattr', 'fsetattr', 'copy_file_range'] + +def gen_defaults(): + for name, value in ops.items(): + if name in utime_ops: + print(generate(OP_FOP_TEMPLATE, name, fop_subs)) + + +for l in open(sys.argv[1], 'r').readlines(): + if l.find('#pragma generate') != -1: + print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") + gen_defaults() + print("/* END GENERATED CODE */") + else: + print(l[:-1]) diff --git a/xlators/features/utime/src/utime-helpers.c b/xlators/features/utime/src/utime-helpers.c new file mode 100644 index 00000000000..29d9ad93561 --- /dev/null +++ 
b/xlators/features/utime/src/utime-helpers.c @@ -0,0 +1,110 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "utime-helpers.h" +#include "utime.h" + +void +gl_timespec_get(struct timespec *ts) +{ +#ifdef TIME_UTC + timespec_get(ts, TIME_UTC); +#else + timespec_now_realtime(ts); +#endif +} + +void +utime_update_attribute_flags(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop) +{ + utime_priv_t *utime_priv = NULL; + + if (!frame || !this) { + goto out; + } + + utime_priv = this->private; + + switch (fop) { + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + frame->root->flags |= MDATA_CTIME; + break; + + case GF_FOP_FALLOCATE: + case GF_FOP_ZEROFILL: + frame->root->flags |= MDATA_MTIME; + frame->root->flags |= MDATA_ATIME; + break; + + case GF_FOP_OPENDIR: + case GF_FOP_OPEN: + case GF_FOP_READ: + if (!utime_priv->noatime) { + frame->root->flags |= MDATA_ATIME; + } + break; + case GF_FOP_MKNOD: + case GF_FOP_MKDIR: + case GF_FOP_SYMLINK: + case GF_FOP_CREATE: + frame->root->flags |= MDATA_ATIME; + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_MTIME; + frame->root->flags |= MDATA_PAR_CTIME; + frame->root->flags |= MDATA_PAR_MTIME; + break; + + case GF_FOP_UNLINK: + case GF_FOP_RMDIR: + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_PAR_CTIME; + frame->root->flags |= MDATA_PAR_MTIME; + break; + + case GF_FOP_WRITE: + frame->root->flags |= MDATA_MTIME; + frame->root->flags |= MDATA_CTIME; + break; + + case GF_FOP_LINK: + case GF_FOP_RENAME: + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_PAR_CTIME; + frame->root->flags |= MDATA_PAR_MTIME; + break; + + case GF_FOP_TRUNCATE: + case 
GF_FOP_FTRUNCATE: + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_MTIME; + break; + + case GF_FOP_REMOVEXATTR: + case GF_FOP_FREMOVEXATTR: + frame->root->flags |= MDATA_CTIME; + break; + + case GF_FOP_COPY_FILE_RANGE: + /* Below 2 are for destination fd */ + frame->root->flags |= MDATA_CTIME; + frame->root->flags |= MDATA_MTIME; + /* Below flag is for the source fd */ + if (!utime_priv->noatime) { + frame->root->flags |= MDATA_ATIME; + } + break; + default: + frame->root->flags = 0; + } +out: + return; +} diff --git a/xlators/features/utime/src/utime-helpers.h b/xlators/features/utime/src/utime-helpers.h new file mode 100644 index 00000000000..2e32d4bece6 --- /dev/null +++ b/xlators/features/utime/src/utime-helpers.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _UTIME_HELPERS_H +#define _UTIME_HELPERS_H + +#include <glusterfs/stack.h> +#include <glusterfs/xlator.h> +#include <glusterfs/timespec.h> +#include <time.h> + +void +gl_timespec_get(struct timespec *ts); +void +utime_update_attribute_flags(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop); + +#endif /* _UTIME_HELPERS_H */ diff --git a/xlators/features/utime/src/utime-mem-types.h b/xlators/features/utime/src/utime-mem-types.h new file mode 100644 index 00000000000..ad1255f85f3 --- /dev/null +++ b/xlators/features/utime/src/utime-mem-types.h @@ -0,0 +1,21 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __UTIME_MEM_TYPES_H__ +#define __UTIME_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_utime_mem_types_ { + utime_mt_utime_t = gf_common_mt_end + 1, + utime_mt_end +}; + +#endif /* __UTIME_MEM_TYPES_H__ */ diff --git a/xlators/features/utime/src/utime-messages.h b/xlators/features/utime/src/utime-messages.h new file mode 100644 index 00000000000..bd40265abaf --- /dev/null +++ b/xlators/features/utime/src/utime-messages.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __UTIME_MESSAGES_H__ +#define __UTIME_MESSAGES_H__ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(UTIME, UTIME_MSG_NO_MEMORY, UTIME_MSG_SET_MDATA_FAILED, + UTIME_MSG_DICT_SET_FAILED); + +#endif /* __UTIME_MESSAGES_H__ */ diff --git a/xlators/features/utime/src/utime.c b/xlators/features/utime/src/utime.c new file mode 100644 index 00000000000..2acc63e6a05 --- /dev/null +++ b/xlators/features/utime/src/utime.c @@ -0,0 +1,392 @@ +/* + Copyright (c) 2018 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "utime.h" +#include "utime-helpers.h" +#include "utime-messages.h" +#include "utime-mem-types.h" +#include <glusterfs/call-stub.h> + +int32_t +gf_utime_invalidate(xlator_t *this, inode_t *inode) +{ + return 0; +} + +int32_t +gf_utime_forget(xlator_t *this, inode_t *inode) +{ + return 0; +} + +int32_t +gf_utime_client_destroy(xlator_t *this, client_t *client) +{ + return 0; +} + +void +gf_utime_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode, + inode_t *linked_inode) +{ + return; +} + +int32_t +gf_utime_release(xlator_t *this, fd_t *fd) +{ + return 0; +} + +int32_t +gf_utime_releasedir(xlator_t *this, fd_t *fd) +{ + return 0; +} + +int32_t +gf_utime_client_disconnect(xlator_t *this, client_t *client) +{ + return 0; +} + +int32_t +gf_utime_fdctx_to_dict(xlator_t *this, fd_t *fd, dict_t *dict) +{ + return 0; +} + +int32_t +gf_utime_inode(xlator_t *this) +{ + return 0; +} + +int32_t +gf_utime_inode_to_dict(xlator_t *this, dict_t *dict) +{ + return 0; +} + +int32_t +gf_utime_history(xlator_t *this) +{ + return 0; +} + +int32_t +gf_utime_fd(xlator_t *this) +{ + return 0; +} + +int32_t +gf_utime_fd_to_dict(xlator_t *this, dict_t *dict) +{ + return 0; +} + +int32_t +gf_utime_fdctx(xlator_t *this, fd_t *fd) +{ + return 0; +} + +int32_t +gf_utime_inodectx(xlator_t *this, inode_t *ino) +{ + return 0; +} + +int32_t +gf_utime_inodectx_to_dict(xlator_t *this, inode_t *ino, dict_t *dict) +{ + return 0; +} + +int32_t +gf_utime_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname) +{ + return 0; +} + +int32_t +gf_utime_priv(xlator_t *this) +{ + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + if (xlator_mem_acct_init(this, 
utime_mt_end + 1) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, UTIME_MSG_NO_MEMORY, + "Memory accounting initialization failed."); + return -1; + } + return 0; +} + +int32_t +gf_utime_set_mdata_setxattr_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xdata) +{ + call_stub_t *stub = frame->local; + /* Don't fail lookup if mdata setxattr fails */ + if (op_ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, UTIME_MSG_SET_MDATA_FAILED, + "dict set of key for set-ctime-mdata failed"); + } + frame->local = NULL; + call_resume(stub); + STACK_DESTROY(frame->root); + return 0; +} + +int32_t +gf_utime_set_mdata_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xdata, + struct iatt *postparent) +{ + dict_t *dict = NULL; + struct mdata_iatt *mdata = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + call_frame_t *new_frame = NULL; + + if (!op_ret && dict_get(xdata, GF_XATTR_MDATA_KEY) == NULL) { + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + goto err; + } + mdata = GF_MALLOC(sizeof(struct mdata_iatt), gf_common_mt_char); + if (mdata == NULL) { + op_errno = ENOMEM; + goto err; + } + iatt_to_mdata(mdata, stbuf); + ret = dict_set_mdata(dict, CTIME_MDATA_XDATA_KEY, mdata, _gf_false); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, UTIME_MSG_NO_MEMORY, + "dict set of key for set-ctime-mdata failed"); + goto err; + } + new_frame = copy_frame(frame); + if (!new_frame) { + op_errno = ENOMEM; + goto stub_err; + } + + new_frame->local = fop_lookup_cbk_stub(frame, default_lookup_cbk, + op_ret, op_errno, inode, stbuf, + xdata, postparent); + if (!new_frame->local) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, UTIME_MSG_NO_MEMORY, + "lookup_cbk stub allocation failed"); + op_errno = ENOMEM; + STACK_DESTROY(new_frame->root); + goto stub_err; + } + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, stbuf->ia_gfid); + + 
new_frame->root->uid = 0; + new_frame->root->gid = 0; + new_frame->root->pid = GF_CLIENT_PID_SET_UTIME; + STACK_WIND(new_frame, gf_utime_set_mdata_setxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, &loc, + dict, 0, NULL); + + dict_unref(dict); + inode_unref(loc.inode); + return 0; + } + + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xdata, + postparent); + return 0; + +err: + if (mdata) { + GF_FREE(mdata); + } +stub_err: + if (dict) { + dict_unref(dict); + } + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} + +int +gf_utime_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int op_errno = EINVAL; + int ret = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + xdata = xdata ? dict_ref(xdata) : dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto err; + } + + ret = dict_set_int8(xdata, GF_XATTR_MDATA_KEY, 1); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, UTIME_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GF_XATTR_MDATA_KEY); + op_errno = -ret; + goto free_dict; + } + + STACK_WIND(frame, gf_utime_set_mdata_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + dict_unref(xdata); + return 0; + +free_dict: + dict_unref(xdata); +err: + STACK_UNWIND_STRICT(lookup, frame, ret, op_errno, NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +init(xlator_t *this) +{ + utime_priv_t *utime = NULL; + + utime = GF_MALLOC(sizeof(*utime), utime_mt_utime_t); + if (utime == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, UTIME_MSG_NO_MEMORY, + "Failed to allocate private memory."); + return -1; + } + memset(utime, 0, sizeof(*utime)); + + this->private = utime; + GF_OPTION_INIT("noatime", utime->noatime, bool, err); + + return 0; +err: + return -1; +} + +void +fini(xlator_t *this) +{ + utime_priv_t *utime = NULL; + + 
utime = this->private; + GF_FREE(utime); + return; +} + +int32_t +reconfigure(xlator_t *this, dict_t *options) +{ + utime_priv_t *utime = this->private; + + GF_OPTION_RECONF("noatime", utime->noatime, options, bool, err); + + return 0; +err: + return -1; +} + +int +notify(xlator_t *this, int event, void *data, ...) +{ + return default_notify(this, event, data); +} + +struct xlator_fops fops = { + .rename = gf_utime_rename, + .mknod = gf_utime_mknod, + .readv = gf_utime_readv, + .fremovexattr = gf_utime_fremovexattr, + .open = gf_utime_open, + .create = gf_utime_create, + .mkdir = gf_utime_mkdir, + .writev = gf_utime_writev, + .rmdir = gf_utime_rmdir, + .fallocate = gf_utime_fallocate, + .truncate = gf_utime_truncate, + .symlink = gf_utime_symlink, + .zerofill = gf_utime_zerofill, + .link = gf_utime_link, + .ftruncate = gf_utime_ftruncate, + .unlink = gf_utime_unlink, + .setattr = gf_utime_setattr, + .fsetattr = gf_utime_fsetattr, + .opendir = gf_utime_opendir, + .removexattr = gf_utime_removexattr, + .lookup = gf_utime_lookup, +}; +struct xlator_cbks cbks = { + .invalidate = gf_utime_invalidate, + .forget = gf_utime_forget, + .client_destroy = gf_utime_client_destroy, + .ictxmerge = gf_utime_ictxmerge, + .release = gf_utime_release, + .releasedir = gf_utime_releasedir, + .client_disconnect = gf_utime_client_disconnect, +}; +struct xlator_dumpops dumpops = { + .fdctx_to_dict = gf_utime_fdctx_to_dict, + .inode = gf_utime_inode, + .inode_to_dict = gf_utime_inode_to_dict, + .history = gf_utime_history, + .fd = gf_utime_fd, + .fd_to_dict = gf_utime_fd_to_dict, + .fdctx = gf_utime_fdctx, + .inodectx = gf_utime_inodectx, + .inodectx_to_dict = gf_utime_inodectx_to_dict, + .priv_to_dict = gf_utime_priv_to_dict, + .priv = gf_utime_priv, +}; + +struct volume_options options[] = { + {.key = {"noatime"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = 
{"ctime"}, + .description = "Enable/Disable atime updation when ctime feature is " + "enabled. When noatime is on, atime is not updated with " + "ctime feature enabled and vice versa."}, + {.key = {NULL}}}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {GD_OP_VERSION_5_0}, + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "utime", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/utime/src/utime.h b/xlators/features/utime/src/utime.h new file mode 100644 index 00000000000..ba55eec00de --- /dev/null +++ b/xlators/features/utime/src/utime.h @@ -0,0 +1,23 @@ +/* + Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __UTIME_H__ +#define __UTIME_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include "utime-autogen-fops.h" + +typedef struct utime_priv { + gf_boolean_t noatime; +} utime_priv_t; + +#endif /* __UTIME_H__ */ |
