diff options
Diffstat (limited to 'xlators/storage')
| -rw-r--r-- | xlators/storage/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/storage/bd/Makefile.am (renamed from xlators/storage/bd_map/Makefile.am) | 0 | ||||
| -rw-r--r-- | xlators/storage/bd/src/Makefile.am (renamed from xlators/storage/bd_map/src/Makefile.am) | 11 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd-aio.c | 527 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd-aio.h | 41 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd-helper.c | 783 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd.c | 2404 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd.h | 178 | ||||
| -rw-r--r-- | xlators/storage/bd_map/src/bd_map.c | 2580 | ||||
| -rw-r--r-- | xlators/storage/bd_map/src/bd_map.h | 76 | ||||
| -rw-r--r-- | xlators/storage/bd_map/src/bd_map_help.c | 501 | ||||
| -rw-r--r-- | xlators/storage/bd_map/src/bd_map_help.h | 69 | ||||
| -rw-r--r-- | xlators/storage/posix/src/Makefile.am | 2 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 12 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 7 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 407 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 1103 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 46 |
18 files changed, 5245 insertions, 3505 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index e1316a127..c08e8e41b 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = posix if ENABLE_BD_XLATOR -SUBDIRS += bd_map +SUBDIRS += bd endif + CLEANFILES = diff --git a/xlators/storage/bd_map/Makefile.am b/xlators/storage/bd/Makefile.am index a985f42a8..a985f42a8 100644 --- a/xlators/storage/bd_map/Makefile.am +++ b/xlators/storage/bd/Makefile.am diff --git a/xlators/storage/bd_map/src/Makefile.am b/xlators/storage/bd/src/Makefile.am index be43d2abb..3d93f7442 100644 --- a/xlators/storage/bd_map/src/Makefile.am +++ b/xlators/storage/bd/src/Makefile.am @@ -1,14 +1,13 @@ - if ENABLE_BD_XLATOR -xlator_LTLIBRARIES = bd_map.la +xlator_LTLIBRARIES = bd.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -bd_map_la_LDFLAGS = -module -avoidversion +bd_la_LDFLAGS = -module -avoid-version LIBBD = -llvm2app -lrt -bd_map_la_SOURCES = bd_map.c bd_map_help.c -bd_map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) +bd_la_SOURCES = bd.c bd-helper.c bd-aio.c +bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO) -noinst_HEADERS = bd_map.h bd_map_help.h +noinst_HEADERS = bd.h bd-aio.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src \ diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c new file mode 100644 index 000000000..62d4590f7 --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.c @@ -0,0 +1,527 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: M. Mohan Kumar <mohan@in.ibm.com> + + Based on posix-aio.c + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <lvm2app.h> +#include <sys/uio.h> + +#include "xlator.h" +#include "glusterfs.h" +#include "defaults.h" +#include "bd.h" +#include "bd-aio.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> + +struct bd_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int op; + off_t offset; + fd_t *fd; +}; + +void +__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = bd_fd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags & (~O_DIRECT))); + bd_fd->odirect = 0; + } + + if (odirect && !bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags | O_DIRECT)); + bd_fd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). 
fd=%d flags=%d pfd->odirect=%d", + strerror (errno), bd_fd->fd, flags, bd_fd->odirect); + } +} + +int +bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + off_t offset = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + this = frame->this; + iobuf = paiocb->iobuf; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)", + paiocb->fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + /* Hack to notify higher layers of EOF. 
*/ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + _fd = bd_fd->fd; + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->op = GF_FOP_READ; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log 
(this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + prebuf = paiocb->prebuf; + this = frame->this; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%p,offset=%llu (%d/%s)", + paiocb->fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +int +bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_writev_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, iov, count, offset, flags, iobref, xdata); + return 0; + } + + 
bd_inode_ctx_get (fd->inode, this, &bdatt); + + _fd = bd_fd->fd; + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->op = GF_FOP_WRITE; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt)); + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +void * +bd_aio_thread (void *data) +{ + xlator_t *this = NULL; + bd_priv_t *priv = NULL; + int ret = 0; + int i = 0; + struct io_event *event = NULL; + struct bd_aio_cb *paiocb = NULL; + struct io_event events[BD_AIO_MAX_NR_GETEVENTS]; + struct timespec ts = {0, }; + + this = data; + THIS = this; + priv = this->private; + + ts.tv_sec = 5; + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS, + &events[0], &ts); + if (ret < 0) { + if (ret == -EINTR) + continue; + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d, exiting", ret); + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + bd_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + 
bd_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + +int +bd_aio_init (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = pthread_create (&priv->aiothread, NULL, + bd_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; +out: + return ret; +} + + +int +bd_aio_on (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = bd_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; + } + + return ret; +} + +int +bd_aio_off (xlator_t *this) +{ + this->fops->readv = bd_readv; + this->fops->writev = bd_writev; + + return 0; +} + +#else + +int +bd_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +bd_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." 
+ " Continuing with synchronous IO"); + return 0; +} + +void +__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h new file mode 100644 index 000000000..16f686a4c --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.h @@ -0,0 +1,41 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _BD_AIO_H +#define _BD_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +/* + * Maximum number of concurrently submitted IO events. 
The heaviest load + * GlusterFS has been able to handle had 60-80 concurrent calls + */ +#define BD_AIO_MAX_NR_EVENTS 256 + +/* Maximum number of completed IO operations to reap per getevents syscall */ +#define BD_AIO_MAX_NR_GETEVENTS 16 + +int bd_aio_on (xlator_t *this); +int bd_aio_off (xlator_t *this); + +int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_BD_AIO_H */ diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c new file mode 100644 index 000000000..5525e346b --- /dev/null +++ b/xlators/storage/bd/src/bd-helper.c @@ -0,0 +1,783 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "run.h" + +int +bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} + +int +bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = inode_ctx_get (inode, this, &ctx_int); + if (ret) + return ret; + if (ctx) + *ctx = (bd_attr_t *) ctx_int; +out: + return ret; +} + +void +bd_local_free (xlator_t *this, bd_local_t *local) +{ + if (!local) + return; + if (local->fd) + fd_unref (local->fd); + else if (local->loc.path) + loc_wipe (&local->loc); + if (local->dict) + dict_unref (local->dict); + if (local->inode) + inode_unref (local->inode); + if (local->bdatt) { + GF_FREE (local->bdatt->type); + GF_FREE (local->bdatt); + } + 
mem_put (local); + local = NULL; +} + +bd_local_t * +bd_local_init (call_frame_t *frame, xlator_t *this) +{ + frame->local = mem_get0 (this->local_pool); + if (!frame->local) + return NULL; + + return frame->local; +} + +/* + * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format. + * This function validates this tag agains volume-uuid. Also goes + * through LV list to find out if a thin-pool is configured or not. + */ +int bd_scan_vg (xlator_t *this, bd_priv_t *priv) +{ + vg_t brick = NULL; + data_t *tmp_data = NULL; + struct dm_list *tags = NULL; + int op_ret = -1; + uuid_t dict_uuid = {0, }; + uuid_t vg_uuid = {0, }; + gf_boolean_t uuid = _gf_false; + lvm_str_list_t *strl = NULL; + struct dm_list *lv_dm_list = NULL; + lv_list_t *lv_list = NULL; + struct dm_list *dm_seglist = NULL; + lvseg_list_t *seglist = NULL; + lvm_property_value_t prop = {0, }; + gf_boolean_t thin = _gf_false; + const char *lv_name = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + lv_dm_list = lvm_vg_list_lvs (brick); + if (!lv_dm_list) + goto check; + + dm_list_iterate_items (lv_list, lv_dm_list) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + if (!dm_seglist) + continue; + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + thin = _gf_true; + lv_name = lvm_lv_get_name (lv_list->lv); + priv->pool = gf_strdup (lv_name); + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lv_name); + break; + } + } + } + +check: + /* If there is no volume-id set in dict, we cant validate */ + tmp_data = dict_get (this->options, "volume-id"); + if (!tmp_data) { + op_ret = 0; + goto out; + } + + op_ret = uuid_parse (tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_log 
(this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in volume file", + tmp_data->data); + op_ret = -1; + goto out; + } + + tags = lvm_vg_get_tags (brick); + if (!tags) { /* no tags in the VG */ + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + dm_list_iterate_items (strl, tags) { + if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + uuid = _gf_true; + break; + } + } + /* UUID tag is not set in VG */ + if (!uuid) { + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + + op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1, + vg_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in VG", strl->str); + op_ret = -1; + goto out; + } + if (uuid_compare (dict_uuid, vg_uuid)) { + gf_log (this->name, GF_LOG_ERROR, + "mismatching volume-id (%s) received. 
" + "already is a part of volume %s ", + tmp_data->data, vg_uuid); + op_ret = -1; + goto out; + } + + op_ret = 0; + +out: + lvm_vg_close (brick); + + if (!thin) + gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in " + "VG %s\n", priv->vg); + else + priv->caps |= BD_CAPS_THIN; + + return op_ret; +} + +/* FIXME: Move this code to common place, so posix and bd xlator can use */ +char * +page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char); + if (!alloc_buf) + return NULL; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; + + return alloc_buf; +} + +static int +__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) +{ + int ret = -1; + int _fd = -1; + char *devpath = NULL; + bd_fd_t *bdfd = NULL; + uint64_t tmp_bdfd = 0; + bd_priv_t *priv = this->private; + bd_gfid_t gfid = {0, }; + bd_attr_t *bdatt = NULL; + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + return 0; + + ret = __fd_ctx_get (fd, this, &tmp_bdfd); + if (ret == 0) { + bdfd = (void *)(long) tmp_bdfd; + *bdfd_p = bdfd; + return 0; + } + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + if (!devpath) + goto out; + + _fd = open (devpath, O_RDWR | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bdfd, ret, out); + + bdfd->fd = _fd; + bdfd->flag = O_RDWR | O_LARGEFILE; + if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + *bdfd_p = bdfd; + + ret = 0; +out: + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bdfd); + } + return ret; +} + +int 
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd) +{ + int ret; + + /* FIXME: Is it ok to fd->lock here ? */ + LOCK (&fd->lock); + { + ret = __bd_fd_ctx_get (this, fd, bdfd); + } + UNLOCK (&fd->lock); + + return ret; +} + +/* + * Validates if LV exists for given inode or not. + * Returns 0 if LV exists and size also matches. + * If LV does not exist -1 returned + * If LV size mismatches, returnes 1 also lv_size is updated with actual + * size + */ +int +bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid) +{ + char *path = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + bd_priv_t *priv = this->private; + struct stat stbuf = {0, }; + uint64_t size = 0; + vg_t vg = NULL; + lv_t lv = NULL; + char *bytes = NULL; + + bytes = strrchr (bd, ':'); + if (bytes) { + *bytes = '\0'; + bytes++; + gf_string2bytesize (bytes, &size); + } + + if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, + "invalid xattr %s", bd); + return -1; + } + *type = gf_strdup (bd); + + /* + * Check if LV really exist, there could be a failure + * after setxattr and successful LV creation + */ + uuid_utoa_r (uuid, gfid); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid); + if (!path) { + gf_log (this->name, GF_LOG_WARNING, + "insufficient memory"); + return 0; + } + + /* Destination file does not exist */ + if (stat (path, &stbuf)) { + gf_log (this->name, GF_LOG_WARNING, + "lstat failed for path %s", path); + return -1; + } + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, + "VG %s does not exist?", priv->vg); + ret = -1; + goto out; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (this->name, GF_LOG_WARNING, + "LV %s does not exist", gfid); + ret = -1; + goto out; + } + + *lv_size = lvm_lv_get_size (lv); + if (size == *lv_size) { + ret = 0; + goto out; + } + + ret = 1; + +out: + if (vg) + lvm_vg_close (vg); + + GF_FREE (path); + return ret; +} 
+ +static int +create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent) +{ + int ret = -1; + runner_t runner = {0, }; + char *path = NULL; + struct stat stat = {0, }; + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--thin", NULL); + runner_argprintf (&runner, "%s/%s", vg, pool); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", lv); + runner_add_args (&runner, "--virtualsize", NULL); + runner_argprintf (&runner, "%ldB", extent); + runner_start (&runner); + runner_end (&runner); + + gf_asprintf (&path, "/dev/%s/%s", vg, lv); + if (!path) { + ret = ENOMEM; + goto out; + } + if (lstat (path, &stat) < 0) + ret = EAGAIN; + else + ret = 0; +out: + GF_FREE (path); + return ret; +} + +int +bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv) +{ + int ret = 0; + vg_t vg = NULL; + bd_gfid_t gfid = {0, }; + + uuid_utoa_r (uuid, gfid); + + if (!strcmp (type, BD_THIN)) + return create_thin_lv (priv->vg, priv->pool, gfid, + size); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return ENOENT; + } + + if (!lvm_vg_create_lv_linear (vg, gfid, size)) { + gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear " + "failed"); + ret = errno; + } + + lvm_vg_close (vg); + + return ret; +} + +int32_t +bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size) +{ + uint64_t new_size = 0; + runner_t runner = {0, }; + bd_gfid_t gfid = {0, }; + int ret = 0; + vg_t vg = NULL; + lv_t lv = NULL; + + uuid_utoa_r (uuid, gfid); + + runinit (&runner); + + runner_add_args (&runner, LVM_RESIZE, NULL); + runner_argprintf (&runner, "%s/%s", priv->vg, gfid); + runner_argprintf (&runner, "-L%ldb", size); + runner_add_args (&runner, "-f", NULL); + + runner_start (&runner); + runner_end (&runner); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening 
VG %s failed", + priv->vg); + return EAGAIN; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid); + ret = EIO; + goto out; + } + new_size = lvm_lv_get_size (lv); + + if (new_size != size) { + gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does " + "not match requested size %ld", new_size, size); + ret = EIO; + } + +out: + lvm_vg_close (vg); + return ret; +} + +uint64_t +bd_get_default_extent (bd_priv_t *priv) +{ + vg_t vg = NULL; + uint64_t size = 0; + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return 0; + } + + size = lvm_vg_get_extent_size (vg); + + lvm_vg_close (vg); + + return size; +} + +/* + * Adjusts the user specified size to VG specific extent size + */ +uint64_t +bd_adjust_size (bd_priv_t *priv, uint64_t size) +{ + uint64_t extent = 0; + uint64_t nr_ex = 0; + + extent = bd_get_default_extent (priv); + if (!extent) + return 0; + + nr_ex = size / extent; + if (size % extent) + nr_ex++; + + size = extent * nr_ex; + + return size; +} + +int +bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno) +{ + vg_t vg = NULL; + lv_t lv = NULL; + int ret = -1; + + *op_errno = 0; + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + *op_errno = ENOENT; + return -1; + } + lv = lvm_lv_from_name (vg, lv_name); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name); + *op_errno = ENOENT; + goto out; + } + ret = lvm_vg_remove_lv (lv); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed", + lv_name); + *op_errno = errno; + goto out; + } +out: + lvm_vg_close (vg); + + return ret; +} + +inline void +bd_update_amtime(struct iatt *iatt, int flag) +{ + struct timespec ts = {0, }; + + clock_gettime (CLOCK_REALTIME, &ts); + if (flag & GF_SET_ATTR_ATIME) { + 
iatt->ia_atime = ts.tv_sec; + iatt->ia_atime_nsec = ts.tv_nsec; + } + if (flag & GF_SET_ATTR_MTIME) { + iatt->ia_mtime = ts.tv_sec; + iatt->ia_mtime_nsec = ts.tv_nsec; + } +} + +int +bd_snapshot_create (bd_local_t *local, bd_priv_t *priv) +{ + char *path = NULL; + bd_gfid_t dest = {0, }; + bd_gfid_t origin = {0, }; + int ret = 0; + runner_t runner = {0, }; + struct stat stat = {0, }; + + uuid_utoa_r (local->dloc->gfid, dest); + uuid_utoa_r (local->loc.gfid, origin); + + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + if (!path) { + gf_log (THIS->name, GF_LOG_WARNING, + "Insufficient memory"); + return ENOMEM; + } + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--snapshot", NULL); + runner_argprintf (&runner, "/dev/%s/%s", priv->vg, origin); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", dest); + if (strcmp (local->bdatt->type, BD_THIN)) + runner_argprintf (&runner, "-L%ldB", local->size); + runner_start (&runner); + runner_end (&runner); + + if (lstat (path, &stat) < 0) + ret = EIO; + + GF_FREE (path); + return ret; +} + +int +bd_clone (bd_local_t *local, bd_priv_t *priv) +{ + int ret = ENOMEM; + int fd1 = -1; + int fd2 = -1; + int i = 0; + char *buff = NULL; + ssize_t bytes = 0; + char *spath = NULL; + char *dpath = NULL; + struct iovec *vec = NULL; + bd_gfid_t source = {0, }; + bd_gfid_t dest = {0, }; + void *bufp[IOV_NR] = {0, }; + + vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec); + if (!vec) + return ENOMEM; + + for (i = 0; i < IOV_NR; i++) { + bufp[i] = page_aligned_alloc (IOV_SIZE, &buff); + if (!buff) + goto out; + vec[i].iov_base = buff; + vec[i].iov_len = IOV_SIZE; + } + + uuid_utoa_r (local->loc.gfid, source); + uuid_utoa_r (local->dloc->gfid, dest); + + gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source); + gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest); + if (!spath || !dpath) + goto out; + + ret = bd_create (local->dloc->gfid, local->size, 
+ local->bdatt->type, priv); + if (ret) + goto out; + + fd1 = open (spath, O_RDONLY | O_DIRECT); + if (fd1 < 0) { + ret = errno; + goto out; + } + fd2 = open (dpath, O_WRONLY | O_DIRECT); + if (fd2 < 0) { + ret = errno; + goto out; + } + + while (1) { + bytes = readv (fd1, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s", + strerror (ret)); + goto out; + } + if (!bytes) + break; + bytes = writev (fd2, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, + "write failed: %s", strerror (ret)); + goto out; + } + } + ret = 0; + +out: + for (i = 0; i < IOV_NR; i++) + GF_FREE (bufp[i]); + GF_FREE (vec); + + if (fd1 != -1) + close (fd1); + if (fd2 != -1) + close (fd2); + + FREE (spath); + FREE (dpath); + + return ret; +} + +/* + * Merges snapshot LV to origin LV and returns status + */ +int +bd_merge (bd_priv_t *priv, uuid_t gfid) +{ + bd_gfid_t dest = {0, }; + char *path = NULL; + struct stat stat = {0, }; + runner_t runner = {0, }; + int ret = 0; + + uuid_utoa_r (gfid, dest); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + + runinit (&runner); + runner_add_args (&runner, LVM_CONVERT, NULL); + runner_add_args (&runner, "--merge", NULL); + runner_argprintf (&runner, "%s", path); + runner_start (&runner); + runner_end (&runner); + + if (!lstat (path, &stat)) + ret = EIO; + + GF_FREE (path); + + return ret; +} + +int +bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict) +{ + vg_t brick = NULL; + lvm_property_value_t prop = {0, }; + lv_t lv = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + inode_t *inode = NULL; + char *origin = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + if (fd) + inode = fd->inode; + else + inode = loc->inode; + + uuid_utoa_r (inode->gfid, gfid); + lv = lvm_lv_from_name (brick, gfid); + if (!lv) { + gf_log 
(THIS->name, GF_LOG_CRITICAL, "LV %s not found", gfid); + ret = ENOENT; + goto out; + } + + prop = lvm_lv_get_property (lv, "origin"); + if (!prop.is_valid || !prop.value.string) { + ret = ENODATA; + goto out; + } + + origin = gf_strdup (prop.value.string); + ret = dict_set_dynstr (dict, BD_ORIGIN, origin); + +out: + lvm_vg_close (brick); + return ret; +} + diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c new file mode 100644 index 000000000..405474c58 --- /dev/null +++ b/xlators/storage/bd/src/bd.c @@ -0,0 +1,2404 @@ +/* + BD translator V2 - Exports Block devices on server side as regular + files to client + + Now only exporting Logical volumes supported. + + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#include <openssl/md5.h> +#include <time.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "bd-aio.h" +#include "defaults.h" +#include "glusterfs3-xdr.h" +#include "run.h" +#include "protocol-common.h" +#include "checksum.h" + +/* + * Call back function for setxattr and removexattr. + * does not do anything. FIXME: How to handle remove/setxattr failure + */ +int +bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_DESTROY (frame->root); + return 0; +} + +/* + * returns 0 if a file is mapped to BD or not. 
+ */ +int +bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid, + char **type, uint64_t *size) +{ + char *bd_xattr = NULL; + char *bd = NULL; + int ret = -1; + loc_t loc = {0, }; + dict_t *dict = NULL; + char *p = NULL; + call_frame_t *bd_frame = NULL; + + if (!xattr) + return 1; + + if (dict_get_str (xattr, BD_XATTR, &p)) + return 1; + + bd_xattr = gf_strdup (p); + + memcpy (loc.gfid, gfid, sizeof (uuid_t)); + + bd_frame = copy_frame (frame); + BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out); + + ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid); + if (ret < 0) {/* LV does not exist */ + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, &loc, + BD_XATTR, NULL); + + gf_log (this->name, GF_LOG_WARNING, + "Mapped LV not available for posix file <gfid:%s>, " + "deleting mapping", uuid_utoa (gfid)); + } else if (ret == 1) { + /* BD_XATTR size and LV size mismatch. Update BD_XATTR */ + gf_asprintf (&bd, "%s:%ld", *type, *size); + + dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (dict, ret, out); + + ret = dict_set_dynstr (dict, BD_XATTR, bd); + if (ret) + goto out; + + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0, + NULL); + } + +out: + dict_del (xattr, BD_XATTR); + GF_FREE (bd_xattr); + GF_FREE (bd); + return ret; +} + +/* + * bd_lookup_cbk: Call back from posix_lookup. 
+ */ +int32_t +bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + int ret = -1; + bd_attr_t *bdatt = NULL; + uint64_t size = 0; + char *type = BD_TYPE_NONE; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + /* iatt already cached */ + if (!bd_inode_ctx_get (inode, this, &bdatt)) + goto next; + + if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size)) + goto out; + + /* BD file, update buf */ + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_errno = ENOMEM; + goto out; + } + memcpy (&bdatt->iatt, buf, sizeof (struct iatt)); + bdatt->type = type; + + /* Cache LV size in inode_ctx */ + ret = bd_inode_ctx_set (inode, this, bdatt); + if (ret < 0) { + GF_FREE (bdatt); + op_errno = EINVAL; + goto out; + } + + bdatt->iatt.ia_size = size; + bdatt->iatt.ia_blocks = size / 512; + +next: + dict_del (xattr, GF_CONTENT_KEY); + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xattr, postparent); + return 0; +} + +/* + * bd_lookup: Issues posix_lookup to find out if file is mapped to BD + * bd_lookup -> posix_lookup -> bd_lookup_cbk +*/ +int32_t +bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + dict_t *bd_xattr = NULL; + bd_attr_t *bdatt = NULL; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) { + if (!xattr_req) { + bd_xattr = dict_new (); + BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out); + xattr_req = bd_xattr; + } + if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0) + goto out; + } + + STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this), + 
FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + + if (bd_xattr) + dict_unref (bd_xattr); + return 0; +out: + BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; +} + +int +bd_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t ctx = 0; + bd_attr_t *bdatt = NULL; + + ret = bd_inode_ctx_get (inode, this, &bdatt); + if (!ret) { + inode_ctx_del (inode, this, &ctx); + FREE (bdatt); + } + return 0; +} + +int +bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + uint64_t size = 0; + char *type = NULL; + + if (op_ret < 0) + goto out; + + list_for_each_entry (entry, &entries->list, list) { + if (entry->d_type != DT_REG) + continue; + if (!bd_get_bd_info (frame, this, entry->dict, + entry->d_stat.ia_gfid, &type, &size)) { + entry->d_stat.ia_size = size; + entry->d_stat.ia_blocks = size / 512; + FREE (type); + } + } + +out: + BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +/* + * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set + * ia_size is updated with the LV(BD_XATTR_SIZE) size + */ +int32_t +bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!dict) { + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + dict = local->dict; + } + + if (dict_set_int8 (dict, BD_XATTR, 0)) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set key %s", BD_XATTR); + goto out; + } + + STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + + 
return 0; +out: + BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict); + return 0; +} + +int +bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, bdatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) { + BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->inode = inode_ref (loc->inode); + + STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +int +bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *buff, dict_t *xdata) +{ + uint64_t size = 0; + uint64_t fr_size = 0; + bd_priv_t *priv = NULL; + vg_t vg = NULL; + + if (op_ret < 0) + goto out; + + priv = this->private; + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + op_ret = -1; + op_errno = EAGAIN; + goto out; + } + size = lvm_vg_get_size (vg); + fr_size = 
lvm_vg_get_free_size (vg); + lvm_vg_close (vg); + + buff->f_blocks += size / buff->f_frsize; + buff->f_bfree += fr_size / buff->f_frsize; + buff->f_bavail += fr_size / buff->f_frsize; + +out: + BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata); + return 0; +} + +/* + * bd_statfs: Mimics statfs by returning used/free extents in the VG + */ +int +bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL); + return 0; +} + +int +bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + bd_local_t *local = frame->local; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + /* if its already cached return it */ + if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) { + BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + + STACK_WIND (frame, 
bd_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +/* + * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD + * file + */ +int +bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = -1; + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + struct iovec vec = {0, }; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + uint64_t bd_size = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + } + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto out; + } + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + _fd = bd_fd->fd; + op_ret = pread (_fd, iobuf->ptr, size, offset); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "read failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + + vec.iov_base = iobuf->ptr; + vec.iov_len = op_ret; + + iobref = iobref_new (); + iobref_add (iobref, iobuf); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_errno = EINVAL; + op_ret = -1; + goto out; + } + bd_size = bdatt->iatt.ia_size; + if (!bd_size || (offset + vec.iov_len) >= bd_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME); + +out: + BD_STACK_UNWIND (readv, frame, op_ret, op_errno, + &vec, 1, &bdatt->iatt, iobref, NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) 
+ iobuf_unref (iobuf); + + return 0; +} + +#ifdef BLKDISCARD +/* + * bd_discard: Sends BLKDISCARD ioctl to the block device + */ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + int op_errno = EINVAL; + bd_fd_t *bd_fd = NULL; + uint64_t param[2] = {0, }; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + /* posix */ + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; + } + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + op_errno = EINVAL; + goto out; + } + + param[0] = offset; + param[1] = len; + ret = ioctl (bd_fd->fd, BLKDISCARD, param); + if (ret < 0) { + if (errno == ENOTTY) + op_errno = ENOSYS; + else + op_errno = errno; + goto out; + } + memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + + BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf, + &bdatt->iatt, xdata); + return 0; + +out: + BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} +#else + +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL); + return 0; +} +#endif + +/* + * Call back from posix_open for opening the backing posix file + * If it failed, close BD fd + */ +int +bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + + if (!op_ret) + goto out; + + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) /* posix file */ + goto out; + + /* posix open failed */ + if (bd_fd_ctx_get (this, 
fd, &bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bd_fd is NULL from fd=%p", fd); + goto out; + } + close (bd_fd->fd); + GF_FREE (bd_fd); + +out: + BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL); + + return 0; +} + +/* + * bd_open: Opens BD file if given posix file is mapped to BD. Also opens + * posix file. + * fd contains both posix and BD fd + */ +int32_t +bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = EINVAL; + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + bd_gfid_t gfid = {0, }; + char *devpath = NULL; + bd_priv_t *priv = this->private; + int _fd = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + goto posix; + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + BD_VALIDATE_MEM_ALLOC (devpath, ret, out); + + _fd = open (devpath, flags | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out); + + bd_fd->fd = _fd; + bd_fd->flag = flags | O_LARGEFILE; + + if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + ret = 0; + +posix: + + /* open posix equivalant of this file, fd needed for fd related + operations like fsetxattr, ftruncate etc */ + STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL); + + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bd_fd); + } + + return 0; +} + +/* + 
* call back from posix_setattr after updating iatt to posix file. + */ +int +bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = local->bdatt; + + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_do_fsync (int fd, int datasync) +{ + int op_errno = 0; + +#ifdef HAVE_FDATASYNC + if (datasync) { + if (fdatasync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fdatasync on fd=%d failed: %s", + fd, strerror (errno)); + } + + } else +#endif + { + if (fsync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fsync on fd=%d failed: %s", + fd, strerror (op_errno)); + } + } + + return op_errno; +} + +/* + * bd_fsync: Syncs if BD fd, forwards the request to posix + * fsync -> posix_setattr -> posix_fsync +*/ +int32_t +bd_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t datasync, dict_t *xdata) +{ + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, fd, datasync, + xdata); + return 0; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + + op_errno = bd_do_fsync (bd_fd->fd, datasync); + if (op_errno) + goto out; + + /* For BD, Update the a|mtime during full fsync only */ + if (!datasync) { + local = bd_local_init 
(frame, this); + /* In case of mem failure, should posix flush called ? */ + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + local->bdatt->type = gf_strdup (bdatt->type); + memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&local->bdatt->iatt, valid); + uuid_copy (local->loc.gfid, fd->inode->gfid); + STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &local->loc, + &local->bdatt->iatt, + valid, NULL); + return 0; + } + +out: + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + bd_local_t *local = NULL; + int op_errno = EINVAL; + loc_t loc = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) + goto out; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "bdfd/bdatt is NULL from fd=%p", fd); + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->fd = fd_ref (fd); + uuid_copy (loc.gfid, bdatt->iatt.ia_gfid); + + /* Update the a|mtime during flush */ + STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD 
(this)->fops->setattr, &loc, &bdatt->iatt, + valid, NULL); + + return 0; + +out: + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + + return 0; +} + +int32_t +bd_release (xlator_t *this, fd_t *fd) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + uint64_t tmp_bfd = 0; + bd_attr_t *bdatt = NULL; + bd_priv_t *priv = this->private; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (ret || !bdatt) /* posix file */ + goto out; + + /* FIXME: Update amtime during release */ + + ret = fd_ctx_del (fd, this, &tmp_bfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bfd is NULL from fd=%p", fd); + goto out; + } + bd_fd = (bd_fd_t *)(long)tmp_bfd; + + close (bd_fd->fd); + GF_FREE (bd_fd); +out: + return 0; +} + +/* + * Call back for removexattr after removing BD_XATTR incase of + * bd create failure + */ +int +bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + return 0; + +} + +/* + * Call back after setting BD_XATTR. Creates BD. 
If BD creation is a failure + * invokes posix_removexattr to remove created BD_XATTR + */ +int +bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto next; + + /* Create LV */ + op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size, + local->bdatt->type, this->private); + if (!op_errno) + goto out; + + /* LV creation failed, remove BD_XATTR */ + if (local->fd) + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + local->fd, BD_XATTR, NULL); + else + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_ret = -1; + op_errno = ENOMEM; + goto next; + } + + memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt)); + bdatt->type = gf_strdup (local->bdatt->type); + + bd_inode_ctx_set (local->inode, THIS, bdatt); + +next: + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + return 0; + +} + +/* + * Call back from posix_stat + */ +int +bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, + dict_t *xdata) +{ + char *param = NULL; + char *type = NULL; + char *s_size = NULL; + char *p = NULL; + char *copy = NULL; + bd_local_t *local = frame->local; + bd_priv_t *priv = this->private; + char *bd = NULL; + uint64_t size = 0; + + if (op_ret < 0) + goto out; + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EOPNOTSUPP; + goto out; + } + + param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + type = 
strtok_r (param, ":", &p); + if (!type) { + op_errno = EINVAL; + goto out; + } + + if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given", + type); + op_errno = EINVAL; + goto out; + } + + s_size = strtok_r (NULL, ":", &p); + + /* If size not specified get default size */ + if (!s_size) + size = bd_get_default_extent (priv); + else + gf_string2bytesize (s_size, &size); + + gf_asprintf (&bd, "%s:%ld", type, size); + BD_VALIDATE_MEM_ALLOC (bd, op_errno, out); + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) { + op_errno = EINVAL; + goto out; + } + + local->bdatt->type = gf_strdup (type); + memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt)); + local->bdatt->iatt.ia_size = size; + + if (local->fd) + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + + GF_FREE (bd); + GF_FREE (copy); + return 0; +} + +int +bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL); + + return 0; +} + +int +bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (op_ret < 0) + goto out; + + if 
(local->offload == BD_OF_SNAPSHOT) + op_ret = bd_snapshot_create (frame->local, this->private); + else + op_ret = bd_clone (frame->local, this->private); + + if (op_ret) { + STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + local->dloc, BD_XATTR, NULL); + return 0; + } + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, op_errno, op_errno, NULL); + + return 0; +} + +int +bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + char *type = NULL; + char *p = NULL; + + if (op_ret < 0) + goto out; + + if (dict_get_str (xattr, BD_XATTR, &p)) { + op_errno = EINVAL; + goto out; + } + + type = gf_strdup (p); + BD_VALIDATE_MEM_ALLOC (type, op_errno, out); + + p = strrchr (type, ':'); + if (!p) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, + "source file xattr %s corrupted?", type); + goto out; + } + + *p='\0'; + + /* For clone size is taken from source LV */ + if (!local->size) { + p++; + gf_string2bytesize (p, &local->size); + } + gf_asprintf (&bd, "%s:%ld", type, local->size); + local->bdatt->type = gf_strdup (type); + dict_del (local->dict, BD_XATTR); + dict_del (local->dict, LINKTO); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + local->dloc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (type); + GF_FREE (bd); + + return 0; +} + +int +bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *iatt, + dict_t *xattr, struct 
iatt *postparent) +{ + bd_local_t *local = frame->local; + char *bd = NULL; + int ret = -1; + char *linkto = NULL; + + if (op_ret < 0 && op_errno != ENODATA) { + op_errno = EINVAL; + goto out; + } + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a " + "regular file"); + goto out; + } + + ret = dict_get_str (xattr, LINKTO, &linkto); + if (linkto) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination file not " + "present in same brick"); + goto out; + } + + ret = dict_get_str (xattr, BD_XATTR, &bd); + if (bd) { + op_errno = EEXIST; + goto out; + } + + local->bdatt = CALLOC (1, sizeof (bd_attr_t)); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int +bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + /* FIXME: if delete failed, remove xattr */ + + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +bd_do_merge(call_frame_t *frame, xlator_t *this) +{ + bd_local_t *local = frame->local; + inode_t *parent = NULL; + char *p = NULL; + int op_errno = 0; + + op_errno = bd_merge (this->private, local->inode->gfid); + if (op_errno) + goto out; + + /* + * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does + * not have loc->pargfid set. Get parent's gfid by getting parents inode + */ + parent = inode_parent (local->inode, NULL, NULL); + if (!parent) { + /* + * FIXME: Snapshot LV already deleted. 
+ * remove xattr, instead of returning failure + */ + op_errno = EINVAL; + goto out; + } + uuid_copy (local->loc.pargfid, parent->gfid); + + p = strrchr (local->loc.path, '/'); + if (p) + p++; + local->loc.name = p; + + STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + + return op_errno; +} + +int +bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, bd_offload_t offload) +{ + char *param = NULL; + char *param_copy = NULL; + char *p = NULL; + char *size = NULL; + char *gfid = NULL; + int op_errno = 0; + bd_local_t *local = frame->local; + + param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + param_copy = param; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->dloc = CALLOC (1, sizeof (loc_t)); + BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + gfid = strtok_r (param, ":", &p); + size = strtok_r (NULL, ":", &p); + if (size) + gf_string2bytesize (size, &local->size); + else if (offload != BD_OF_CLONE) + local->size = bd_get_default_extent (this->private); + + if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) { + op_errno = EINVAL; + goto out; + } + if (dict_set_int8 (local->dict, LINKTO, 1) < 0) { + op_errno = EINVAL; + goto out; + } + + uuid_parse (gfid, local->dloc->gfid); + local->offload = offload; + + STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, local->dloc, + local->dict); + + return 0; + +out: + if (fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (param_copy); + return 0; +} + +/* + * bd_setxattr: Used to create & map an LV to a posix file using + * BD_XATTR xattr + * 
bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + */ +int +bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) + cl_type = BD_OF_MERGE; + + bd_inode_ctx_get (loc->inode, this, &bdatt); + if (!cl_type && !data) { + STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, loc, dict, + flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->data = data; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s not mapped to BD", loc->path); + op_errno = EINVAL; + goto out; + } + if (cl_type == BD_OF_MERGE) + bd_do_merge (frame, this); + else + bd_offload (frame, this, loc, NULL, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s already mapped to BD", loc->path); + op_errno = EEXIST; + goto out; + } + STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, loc, xdata); + } + + return 0; +out: + if (op_errno) + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); + + return 0; +} + +/* + * bd_fsetxattr: Used to create/map an LV to a posix file using + * BD_XATTR xattr + * bd_fsetxattr -> 
posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + * -> bd_fsetxattr_cbk + */ +int32_t +bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + data = dict_get (dict, BD_XATTR); + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) { + /* + * bd_merge is not supported for fsetxattr, because snapshot LV + * is opened and it causes problem in snapshot merge + */ + op_errno = EOPNOTSUPP; + goto out; + } + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + if (!cl_type && !data) { + /* non bd file object */ + STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + local->data = data; + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p not mapped to BD", fd); + op_errno = EINVAL; + goto out; + + } + bd_offload (frame, this, NULL, fd, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p already mapped to BD", fd); + op_errno = EEXIST; + goto out; + } + STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->fstat, fd, xdata); + } + + return 0; +out: + + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +bd_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +out: + BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int32_t +bd_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + + return 0; +out: + BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int +bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * Call back for setxattr after setting BD_XATTR_SIZE. 
+ */ +int +bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + char *bd = NULL; + + if (op_ret < 0) + goto out; + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) + goto revert_xattr; + + op_errno = bd_resize (this->private, local->inode->gfid, + local->bdatt->iatt.ia_size); + if (op_errno) + goto revert_xattr; + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + /* LV resized, update new size in the cache */ + bdatt->iatt.ia_size = local->bdatt->iatt.ia_size; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + else + BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + + return 0; + +revert_xattr: + /* revert setxattr */ + op_ret = dict_get_str (local->dict, BD_XATTR, &bd); + GF_FREE (bd); + gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size); + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * call back from posix_[f]truncate_stat + * If offset > LV size, it resizes the LV and calls posix_setxattr + * to update new LV size in xattr else calls posix_setattr for updating + * the posix file so that truncate fop behaves properly + */ +int +bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if 
(op_ret < 0) + goto out; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) { + op_errno = EINVAL; + goto out; + } + + gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, + NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, + NULL); + GF_FREE (bd); + return 0; +} + +void +bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, + off_t offset, bd_attr_t *bdatt) +{ + bd_local_t *local = NULL; + struct iatt prebuf = {0, }; + int op_errno = 0; + int op_ret = -1; + + /* If requested size is less than LV size, return success */ + if (offset <= bdatt->iatt.ia_size) { + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + op_ret = 0; + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (fd) { + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + } else { + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); + } + + local->bdatt->iatt.ia_size = + bd_adjust_size (this->private, offset); + + STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, NULL); + + return; + +out: + if (fd) + BD_STACK_UNWIND (ftruncate, frame, op_ret, 
op_errno, + &prebuf, &bdatt->iatt, NULL); + else + BD_STACK_UNWIND (truncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + return; +} + +/* + * bd_ftruncate: Resizes a LV if fd belongs to BD. + */ +int32_t +bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, fd, NULL, offset, bdatt); + return 0; +out: + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* + * bd_truncate: Resizes a LV if file maps to LV. + */ +int32_t +bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, NULL, loc, offset, bdatt); + return 0; + +out: + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, + uint64_t bd_size) +{ + int index = 0; + int retval = 0; + off_t internal_offset = 0; + + if (!vector) + return -EFAULT; + + retval = pwritev (fd, vector, count, offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + retval = -errno; + goto err; + } +/* + + + internal_offset = 
offset; + for (index = 0; index < count; index++) { + if (internal_offset > bd_size) { + op_ret = -ENOSPC; + goto err; + } + if (internal_offset + vector[index].iov_len > bd_size) { + vector[index].iov_len = bd_size - internal_offset; + no_space = 1; + } + retval = pwritev (fd, vector[index].iov_base, + vector[index].iov_len, internal_offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + op_ret = -errno; + goto err; + } + op_ret += retval; + internal_offset += retval; + if (no_space) + break; + } +*/ +err: + return retval; +} + +/* + * bd_writev: Writes to LV if its BD file or forwards the request to posix_write + * bd_writev -> posix_writev -> bd_writev_cbk + */ +int +bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdict) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + uint64_t size = 0; + struct iatt prebuf = {0, }; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (vector, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { /* posix fd */ + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdict); + return 0; + } + + _fd = bd_fd->fd; + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + size = bdatt->iatt.ia_size; + + op_ret = __bd_pwritev (_fd, vector, count, offset, size); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 + ", %s", offset, strerror (op_errno)); + goto out; + } + + memcpy (&prebuf, &bdatt->iatt, 
sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +out: + + BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + int *valid = cookie; + bd_local_t *local = frame->local; + + if (op_ret < 0 || !valid || !local) + goto out; + + if (bd_inode_ctx_get (local->inode, this, &bdatt)) + goto out; + + if (*valid & GF_SET_ATTR_UID) + bdatt->iatt.ia_uid = postbuf->ia_uid; + else if (*valid & GF_SET_ATTR_GID) + bdatt->iatt.ia_gid = postbuf->ia_gid; + else if (*valid & GF_SET_ATTR_MODE) { + bdatt->iatt.ia_type = postbuf->ia_type; + bdatt->iatt.ia_prot = postbuf->ia_prot; + } else if (*valid & GF_SET_ATTR_ATIME) { + bdatt->iatt.ia_atime = postbuf->ia_atime; + bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec; + } else if (*valid & GF_SET_ATTR_MTIME) { + bdatt->iatt.ia_mtime = postbuf->ia_mtime; + bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec; + } + + bdatt->iatt.ia_ctime = postbuf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec; + + memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); +out: + FREE (valid); + BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +int +bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + int *ck_valid = NULL; + int op_errno = 0; + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + ck_valid = CALLOC (1, sizeof (valid)); + BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); + + 
local->inode = inode_ref (loc->inode); + *ck_valid = valid; + + STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; +out: + BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata); + return 0; +} + +int +bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + if (bd_inode_ctx_get (inode, this, &bdatt)) + goto out; + + bdatt->iatt.ia_ctime = buf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec; + bdatt->iatt.ia_nlink = buf->ia_nlink; + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, NULL); + return 0; +} + +int +bd_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int +bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, const char *name, dict_t *xdata) +{ + dict_t *xattr = NULL; + int op_ret = -1; + int op_errno = ENOMEM;; + bd_priv_t *priv = this->private; + + xattr = dict_new (); + if (!xattr) + goto out; + + if (!strcmp (name, VOL_TYPE)) + op_ret = dict_set_int64 (xattr, (char *)name, 1); + else if (!strcmp (name, VOL_CAPS)) + op_ret = dict_set_int64 (xattr, (char *)name, priv->caps); + else + op_ret = bd_get_origin (this->private, loc, fd, xattr); + +out: + if (loc) + BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, + xdata); + else + BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + op_ret = dict_reset (xattr); + dict_unref (xattr); + + return 0; +} + +int +bd_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t 
*fd, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata); + else + STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + +int +bd_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata); + else + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + + return 0; +} + +int +bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + bd_gfid_t gfid = {0, }; + bd_local_t *local = frame->local; + + if (buf->ia_nlink > 1) + goto posix; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + uuid_utoa_r (inode->gfid, gfid); + if (bd_delete_lv (this->private, gfid, &op_errno) < 0) { + if (op_errno != ENOENT) + goto out; + } + +posix: + /* remove posix */ + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +bd_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; + } + + local = bd_local_init 
(frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, NULL); + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +bd_priv (xlator_t *this) +{ + return 0; +} + +int32_t +bd_inode (xlator_t *this) +{ + return 0; +} + +int32_t +bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int _fd = -1; + char *alloc_buf = NULL; + char *buf = NULL; + int32_t weak_checksum = 0; + bd_fd_t *bd_fd = NULL; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rchecksum, fd, offset, + len, xdata); + return 0; + } + + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); + + alloc_buf = page_aligned_alloc (len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; + goto out; + } + + _fd = bd_fd->fd; + + LOCK (&fd->lock); + { + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + op_errno = errno; + } + } + UNLOCK (&fd->lock); + + if (ret < 0) + goto out; + + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, + (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, + (unsigned char *) strong_checksum); + + op_ret = 0; +out: + BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, + weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + 
int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that bd xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + break; + } + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); + + if (ret != 0) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + bd_priv_t *priv = this->private; + + GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options, + bool, out); + + if (priv->aio_configured) + bd_aio_on (this); + else + bd_aio_off (this); + + ret = 0; +out: + return ret; +} + +/** + * bd xlator init - Validate configured VG + */ +int +init (xlator_t *this) +{ + int ret = 0; + char *vg_data = NULL; + char *device = NULL; + bd_priv_t *_private = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: storage/bd needs posix as subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling. 
Please check the volume file."); + } + + GF_OPTION_INIT ("export", vg_data, str, error); + GF_OPTION_INIT ("device", device, str, error); + + /* Now we support only LV device */ + if (strcasecmp (device, BACKEND_VG)) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: unknown %s backend %s", BD_XLATOR, device); + return -1; + } + + this->local_pool = mem_pool_new (bd_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: Failed to create bd memory pool"); + return -1; + } + + ret = 0; + _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private); + if (!_private) + goto error; + + this->private = _private; + _private->vg = gf_strdup (vg_data); + if (!_private->vg) + goto error; + + _private->handle = lvm_init (NULL); + if (!_private->handle) { + gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed"); + goto error; + } + _private->caps = BD_CAPS_BD; + if (bd_scan_vg (this, _private)) + goto error; + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error); + if (_private->aio_configured) { + if (bd_aio_on (this)) { + gf_log (this->name, GF_LOG_ERROR, + "BD AIO init failed"); + ret = -1; + goto error; + } + } + + _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT; + + return 0; +error: + GF_FREE (_private->vg); + if (_private->handle) + lvm_quit (_private->handle); + mem_pool_destroy (this->local_pool); + GF_FREE (_private); + return -1; +} + +void +fini (xlator_t *this) +{ + bd_priv_t *priv = this->private; + mem_pool_destroy (this->local_pool); + this->local_pool = NULL; + if (!priv) + return; + lvm_quit (priv->handle); + GF_FREE (priv->vg); + this->private = NULL; + GF_FREE (priv); + return; +} + +struct xlator_dumpops dumpops = { + .priv = bd_priv, + .inode = bd_inode, +}; + +struct xlator_fops fops = { + .readdirp = bd_readdirp, + .lookup = bd_lookup, + .stat = bd_stat, + .statfs = bd_statfs, + .open = bd_open, + .fstat = 
bd_fstat, + .rchecksum = bd_rchecksum, + .readv = bd_readv, + .fsync = bd_fsync, + .setxattr = bd_setxattr, + .fsetxattr = bd_fsetxattr, + .removexattr = bd_removexattr, + .fremovexattr=bd_fremovexattr, + .truncate = bd_truncate, + .ftruncate = bd_ftruncate, + .writev = bd_writev, + .getxattr = bd_getxattr, + .fgetxattr = bd_fgetxattr, + .unlink = bd_unlink, + .link = bd_link, + .flush = bd_flush, + .setattr = bd_setattr, + .discard = bd_discard, +}; + +struct xlator_cbks cbks = { + .release = bd_release, + .forget = bd_forget, +}; + +struct volume_options options[] = { + { .key = {"export"}, + .type = GF_OPTION_TYPE_STR}, + { .key = {"device"}, + .type = GF_OPTION_TYPE_STR, + .default_value = BACKEND_VG}, + { + .key = {"bd-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + + { .key = {NULL} } +}; diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h new file mode 100644 index 000000000..34b4c9e22 --- /dev/null +++ b/xlators/storage/bd/src/bd.h @@ -0,0 +1,178 @@ +/* + BD translator - Exports Block devices on server side as regular + files to client + + Copyright IBM, Corp. 2012 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/*
 * BD_VALIDATE_MEM_ALLOC: bail out of the calling function when an
 * allocation failed. Sets @op_errno to ENOMEM, logs, and jumps to @label.
 * Expects a variable named 'this' (the xlator) to be in scope at the call
 * site. Wrapped in do { } while (0) so it behaves as a single statement,
 * e.g. inside an unbraced if/else.
 */
#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label)                    \
        do {                                                            \
                if (!(buff)) {                                          \
                        (op_errno) = ENOMEM;                            \
                        gf_log (this->name, GF_LOG_ERROR,               \
                                "out of memory");                       \
                        goto label;                                     \
                }                                                       \
        } while (0)
do { \ + bd_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + bd_local_free (__this, __local); \ + } while (0) + +typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; + +enum gf_bd_mem_types_ { + gf_bd_private = gf_common_mt_end + 1, + gf_bd_attr, + gf_bd_fd, + gf_bd_mt_end +}; + +/** + * bd_fd - internal structure + */ +typedef struct bd_fd { + int fd; + int32_t flag; + int odirect; +} bd_fd_t; + +typedef struct bd_priv { + lvm_t handle; + char *vg; + char *pool; + int caps; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; + gf_boolean_t aio_configured; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif +} bd_priv_t; + + +typedef enum bd_type { + BD_TYPE_NONE, + BD_TYPE_LV, +} bd_type_t; + +typedef struct { + struct iatt iatt; + char *type; +} bd_attr_t; + +typedef enum { + BD_OF_NONE, + BD_OF_CLONE, + BD_OF_SNAPSHOT, + BD_OF_MERGE, +} bd_offload_t; + +typedef struct { + dict_t *dict; + bd_attr_t *bdatt; + inode_t *inode; + loc_t loc; + fd_t *fd; + data_t *data; /* for setxattr */ + bd_offload_t offload; + uint64_t size; + loc_t *dloc; +} bd_local_t; + +/* Prototypes */ +int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx); +int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx); +int bd_scan_vg (xlator_t *this, bd_priv_t *priv); +bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this); +void bd_local_free (xlator_t *this, bd_local_t *local); +int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd); +char *page_aligned_alloc (size_t size, char **aligned_buf); +int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid); +uint64_t bd_get_default_extent (bd_priv_t *priv); +uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size); +int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv); +int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t 
size); +int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); + +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +inline void bd_update_amtime(struct iatt *iatt, int flag); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); + +#endif diff --git a/xlators/storage/bd_map/src/bd_map.c b/xlators/storage/bd_map/src/bd_map.c deleted file mode 100644 index 9c8f69c64..000000000 --- a/xlators/storage/bd_map/src/bd_map.c +++ /dev/null @@ -1,2580 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Now only exporting Logical volumes supported. - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <time.h> -#include <lvm2app.h> -#include <openssl/md5.h> - -#include "bd_map.h" -#include "bd_map_help.h" -#include "defaults.h" -#include "glusterfs3-xdr.h" -#include "run.h" -#include "protocol-common.h" - -/* Regular fops */ - -int -bd_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char path[PATH_MAX] = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - sprintf (path, "/dev/mapper/%s", loc->path); - op_ret = access (path, mask & 07); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - op_ret = 0; -out: - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL); - - return 0; -} - -#define LV_RENAME "/sbin/lvrename" - -int bd_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *new_path = NULL; - char *np = NULL; - struct iatt stbuf = {0, }; - struct iatt preoldparent = {0, }; - struct iatt postoldparent = {0, }; - struct iatt prenewparent = {0, }; - struct iatt postnewparent = {0, }; - bd_priv_t *priv = NULL; - bd_entry_t *lventry = NULL; - bd_entry_t *newp_entry = NULL; - char *path = NULL; - struct stat v_stat = {0, }; - runner_t runner = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, oldloc->path); - if (lventry->refcnt > 1) { - op_errno = EBUSY; - goto out; - } - - memcpy (&preoldparent, lventry->parent->attr, sizeof(preoldparent)); - - new_path = np = gf_strdup (newloc->path); - if (!new_path) - goto out; - new_path = strrchr (np, '/'); - if 
(!new_path) { - op_errno = EINVAL; - goto out; - } - - *new_path = '\0'; - BD_ENTRY (priv, newp_entry, np); - - memcpy (&prenewparent, newp_entry->parent->attr, sizeof(preoldparent)); - - runinit (&runner); - - runner_add_args (&runner, LV_RENAME, NULL); - runner_add_args (&runner, lventry->parent->name, NULL); - runner_add_args (&runner, oldloc->name, NULL); - runner_add_args (&runner, newloc->name, NULL); - - runner_start (&runner); - runner_end (&runner); - - /* verify */ - gf_asprintf (&path, "/dev/%s", newloc->path); - if (stat (path, &v_stat) < 0) { - op_errno = EIO; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry); - BD_ENTRY_UPDATE_MTIME (newp_entry); - memcpy (&postoldparent, lventry->parent->attr, sizeof(postoldparent)); - memcpy (&postnewparent, newp_entry->parent->attr, - sizeof(postoldparent)); - BD_WR_LOCK (&priv->lock); - strncpy (lventry->name, newloc->name, sizeof(lventry->name)); - memcpy (&stbuf, lventry->attr, sizeof(stbuf)); - BD_UNLOCK (&priv->lock); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (newp_entry) - BD_PUT_ENTRY (priv, newp_entry); - if (np) - GF_FREE (np); - if (path) - GF_FREE (path); - - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf, - &preoldparent, &postoldparent, &prenewparent, - &postnewparent, NULL); - return 0; -} - -int32_t -bd_delete_lv (bd_priv_t *priv, bd_entry_t *p_entry, bd_entry_t *lventry, - const char *path, int *op_errno) -{ - vg_t vg = NULL; - lv_t lv = NULL; - int op_ret = -1; - - *op_errno = 0; - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, p_entry->name, "w", 0); - if (!vg) { - *op_errno = ENOENT; - BD_UNLOCK (&priv->lock); - goto out; - } - - lv = lvm_lv_from_name (vg, lventry->name); - if (!lv) { - lvm_vg_close (vg); - *op_errno = ENOENT; - BD_UNLOCK (&priv->lock); - goto out; - } - op_ret = lvm_vg_remove_lv (lv); - if (op_ret < 0) { - *op_errno = errno; - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - goto out; - } - lvm_vg_close (vg); - - op_ret = 
bd_entry_rm (path); - if (op_ret < 0) { - *op_errno = EIO; - BD_UNLOCK (&priv->lock); - goto out; - } - BD_ENTRY_UPDATE_MTIME (p_entry); - - op_ret = 0; - op_errno = 0; - - BD_UNLOCK (&priv->lock); - op_ret = 0; -out: - return op_ret; -} - -int32_t -bd_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_priv_t *priv = NULL; - bd_entry_t *lventry = NULL; - bd_entry_t *p_entry = NULL; - char *vg_name = NULL; - char *volume = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - volume = vg_name = gf_strdup (loc->path); - if (!volume) - goto out; - volume = strrchr (volume, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - *volume = '\0'; - - BD_ENTRY (priv, p_entry, vg_name); - BD_ENTRY (priv, lventry, loc->path); - if (!p_entry || !lventry) - goto out; - - memcpy (&preparent, p_entry->attr, sizeof(preparent)); - op_ret = bd_delete_lv (priv, p_entry, lventry, loc->path, &op_errno); - memcpy (&postparent, p_entry->attr, sizeof(postparent)); -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (vg_name) - GF_FREE (vg_name); - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, - &preparent, &postparent, NULL); - - return 0; -} - -#define LVM_CREATE "/sbin/lvcreate" - -#define IOV_NR 4 -#define IOV_SIZE (4 * 1024) - -int bd_clone_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output, - const char *vg_name, const char *lv_name, - const char *dest_lv_name, struct iatt *stbuf) -{ - int32_t ret = -1; - vg_t vg = NULL; - lv_t lv = NULL; - ssize_t size = 0; - uint64_t extent = 0; - 
int fd1 = -1; - int fd2 = -1; - struct iatt iattr = {0, }; - bd_entry_t *lventry = NULL; - char path[512] = {0, }; - struct iovec *vec = NULL; - int i = 0; - ssize_t bytes = 0; - int nr_iov = 0; - - vec = GF_CALLOC (IOV_NR, sizeof(struct iovec), gf_common_mt_iovec); - if (!vec) - goto out; - - for (i = 0; i < IOV_NR; i++) { - vec[i].iov_base = GF_MALLOC (IOV_SIZE, gf_common_mt_char); - if (!vec[i].iov_base) - goto out; - vec[i].iov_len = IOV_SIZE; - } - - vg = lvm_vg_open (priv->handle, vg_name, "w", 0); - if (!vg) { - gf_log (THIS->name, GF_LOG_ERROR, - "lvm_vg_open %s failed", vg_name); - ret = -1; - goto out; - } - lv = lvm_lv_from_name (vg, lv_name); - if (!lv) { - gf_log (THIS->name, GF_LOG_ERROR, "lvm_lv_from_name failed"); - ret = -1; - goto out; - } - - size = lvm_lv_get_size (lv); - extent = size / lvm_vg_get_extent_size (vg); - - if (lvm_vg_create_lv_linear (vg, dest_lv_name, size) == NULL) { - gf_log (THIS->name, GF_LOG_ERROR, "lv_create:%s", - lvm_errmsg(priv->handle)); - ret = -1; - goto out; - } - sprintf (path, "/dev/%s/%s", vg_name, lv_name); - fd1 = open (path, O_RDONLY); - if (fd1 < 0) { - gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path); - goto out; - } - sprintf (path, "/dev/%s/%s", vg_name, dest_lv_name); - fd2 = open (path, O_WRONLY); - if (fd2 < 0) { - gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path); - goto out; - } - - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = size; - - bytes = size; - while (bytes) { - size = readv(fd1, vec, IOV_NR); - if (size < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "read failed:%s", strerror(errno)); - goto out; - } - if (size < IOV_NR * IOV_SIZE) { - vec[size / IOV_SIZE].iov_len = size % IOV_SIZE; - nr_iov = (size / IOV_SIZE) + 1; - } else - nr_iov = IOV_NR; - bytes -= size; - size = writev (fd2, vec, nr_iov); - if (size < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "write failed:%s", strerror(errno)); - goto out; - } - } - - lventry = bd_entry_add (p_entry, dest_lv_name, 
&iattr, IA_IFREG); - if (!lventry) { - ret = EAGAIN; - goto out; - } - - if (stbuf) - memcpy (stbuf, &iattr, sizeof(iattr)); - - ret = 0; - gf_log (THIS->name, GF_LOG_INFO, "Clone completed"); -out: - if (vg) - lvm_vg_close (vg); - if (fd1 != -1) - close (fd1); - if (fd2 != -1) - close (fd2); - if (vec) - iov_free (vec, IOV_NR); - return ret; -} - -int bd_snapshot_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output, - const char *lv_name, const char *dest_lv, char *size, - struct iatt *stbuf) -{ - int32_t ret = -1; - struct iatt iattr = {0, }; - struct stat stat = {0, }; - bd_entry_t *lventry = NULL; - char *error = NULL; - int retval = -1; - runner_t runner = {0, }; - char *path = NULL; - vg_t vg = NULL; - lv_t lv = NULL; - - runinit (&runner); - - runner_add_args (&runner, LVM_CREATE, NULL); - runner_add_args (&runner, "--snapshot", NULL); - runner_argprintf (&runner, "/dev/%s/%s", p_entry->name, lv_name); - runner_add_args (&runner, "--name", NULL); - runner_argprintf (&runner, "%s", dest_lv); - runner_argprintf (&runner, "-L%s", size); - - runner_start (&runner); - runner_end (&runner); - - gf_asprintf (&path, "/dev/%s/%s", p_entry->name, dest_lv); - if (!path) { - ret = -ENOMEM; - goto out; - } - if (lstat (path, &stat) < 0) { - ret = -EAGAIN; - if (output) - gf_asprintf (&error, "try again"); - goto out; - } - - vg = lvm_vg_open (priv->handle, p_entry->name, "r", 0); - if (!vg) { - ret = -EIO; - if (output) - gf_asprintf (&error, "can't open vg %s", p_entry->name); - goto out; - } - lv = lvm_lv_from_name (vg, lv_name); - if (!lv) { - ret = -EIO; - if (output) - gf_asprintf (&error, "can't open lv %s", lv_name); - goto out; - } - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = lvm_lv_get_size (lv); - lventry = bd_entry_add (p_entry, dest_lv, &iattr, IA_IFREG); - if (!lventry) { - if (output) - gf_asprintf (&error, "try again"); - ret = -EAGAIN; - goto out; - } - if (stbuf) - memcpy (stbuf, &iattr, sizeof(iattr)); - ret = 0; -out: - if (vg) - 
lvm_vg_close (vg); - if (error && output) - retval = dict_set_str (output, "error", error); - GF_FREE (path); - return ret; -} - -/* - * Creates a snapshot of given LV - */ -int -bd_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *lventry = NULL; - char *name = NULL; - char *np = NULL; - char *volume = NULL; - char *vg_name = NULL; - char *path = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - if (strchr (loc->path, '/')) { - vg_name = gf_strdup (loc->path); - volume = strrchr (vg_name, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - GF_FREE (vg_name); - vg_name = NULL; - } - - /* - * symlink creation for BD xlator is different - * source (LV) has to exist for creation of symbolic link (snapshot) - */ - if (strchr (linkname, '/')) { - op_errno = EOPNOTSUPP; - goto out; - } - gf_asprintf (&path, "%s/%s", priv->vg, linkname); - if (!path) { - op_errno = -ENOMEM; - goto out; - } - BD_ENTRY (priv, lventry, path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - - name = np = gf_strdup (loc->path); - if (!name) - goto out; - - /* Get LV name from loc->path */ - name = strrchr (loc->path, '/'); - if (name != loc->path) - name++; - - memcpy (&preparent, lventry->parent->attr, sizeof(preparent)); - if (bd_snapshot_lv (priv, lventry->parent, NULL, lventry->name, - name, "1", &stbuf) < 0) { - op_errno = EAGAIN; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry->parent); - memcpy (&postparent, lventry->parent->attr, sizeof (postparent)); 
- op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (np) - GF_FREE (np); - if (vg_name) - GF_FREE (vg_name); - if (path) - GF_FREE (path); - - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -/* - * bd_link: Does full clone of given logical volume - * A new logical volume with source logical volume's size created - * and entire content copied - */ -int -bd_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *lventry = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, oldloc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - memcpy (&postparent, lventry->parent->attr, sizeof (postparent)); - if (bd_clone_lv (priv, lventry->parent, NULL, lventry->parent->name, - lventry->name, newloc->name, &stbuf) < 0) { - op_errno = EAGAIN; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry->parent); - memcpy (&preparent, lventry->parent->attr, sizeof (preparent)); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - - - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, - (oldloc)?oldloc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int32_t -bd_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - bd_fd_t *bd_fd = NULL; - bd_entry_t *lventry = NULL; - bd_priv_t *priv = NULL; - char *devpath = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - 
VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - - gf_asprintf (&devpath, "/dev/%s/%s", lventry->parent->name, - lventry->name); - _fd = open (devpath, flags, 0); - if (_fd == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "open on %s: %s", devpath, strerror (op_errno)); - goto out; - } - - bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd); - if (!bd_fd) { - op_errno = errno; - goto out; - } - bd_fd->entry = lventry; - bd_fd->fd = _fd; - - op_ret = fd_ctx_set (fd, this, (uint64_t)(long)bd_fd); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set the fd context path=%s fd=%p", - loc->name, fd); - goto out; - } - - op_ret = 0; -out: - if (op_ret == -1) { - if (_fd != -1) - close (_fd); - /* FIXME: Should we call fd_ctx_set with NULL? */ - if (bd_fd) - GF_FREE (bd_fd); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - } - if (devpath) - GF_FREE (devpath); - - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); - - return 0; -} - -int -bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) -{ - uint64_t tmp_bd_fd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - bd_priv_t *priv = NULL; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - struct iovec vec = {0, }; - bd_fd_t *bd_fd = NULL; - int ret = -1; - struct iatt stbuf = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL from fd=%p", fd); - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - 
if (!size) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); - goto out; - } - iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); - if (!iobuf) { - op_errno = ENOMEM; - goto out; - } - _fd = bd_fd->fd; - op_ret = pread (_fd, iobuf->ptr, size, offset); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "read failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - vec.iov_base = iobuf->ptr; - vec.iov_len = op_ret; - - iobref = iobref_new (); - iobref_add (iobref, iobuf); - BD_ENTRY_UPDATE_ATIME (bd_fd->entry); - - memcpy (&stbuf, bd_fd->entry->attr, sizeof(stbuf)); - - /* Hack to notify higher layers of EOF. */ - if (bd_fd->entry->size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) >= bd_fd->entry->size) - op_errno = ENOENT; - op_ret = vec.iov_len; -out: - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - &vec, 1, &stbuf, iobref, NULL); - - if (iobref) - iobref_unref (iobref); - if (iobuf) - iobuf_unref (iobuf); - return 0; -} - -#define LVM_RESIZE "/sbin/lvresize" - -int32_t -bd_resize (bd_priv_t *priv, bd_entry_t *lventry, off_t *size) -{ - bd_entry_t *vgentry = NULL; - uint64_t extent = 0; - int32_t op_ret = -1; - vg_t vg = NULL; - uint32_t nr_ex = 0; - lv_t lv = NULL; - uint64_t new_size = 0; - runner_t runner = {0, }; - - BD_ENTRY (priv, vgentry, lventry->parent->name); - if (!vgentry) { - op_ret = ENOENT; - goto out; - } - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0); - if (!vg) { - op_ret = lvm_errno (priv->handle); - BD_UNLOCK (&priv->lock); - goto out; - } - - extent = lvm_vg_get_extent_size (vg); - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - - nr_ex = *size / extent; - if (*size % extent) - nr_ex++; - *size = extent * nr_ex; - - runinit (&runner); - - runner_add_args (&runner, LVM_RESIZE, NULL); - runner_argprintf (&runner, "/dev/%s/%s", lventry->parent->name, - lventry->name); - runner_argprintf (&runner, "-l%ld", 
nr_ex); - runner_add_args (&runner, "-f", NULL); - - runner_start (&runner); - runner_end (&runner); - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0); - if (!vg) { - op_ret = lvm_errno (priv->handle); - BD_UNLOCK (&priv->lock); - goto out; - } - - lv = lvm_lv_from_name (vg, lventry->name); - if (!lv) { - op_ret = lvm_errno (priv->handle); - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - goto out; - } - new_size = lvm_lv_get_size (lv); - lvm_vg_close (vg); - if (new_size != *size) { - op_ret = EIO; - BD_UNLOCK (&priv->lock); - goto out; - } - - BD_UNLOCK (&priv->lock); - op_ret = 0; - -out: - if (vgentry) - BD_PUT_ENTRY (priv, vgentry); - - return op_ret; -} - - int32_t -bd_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - bd_fd_t *bd_fd = NULL; - int ret = -1; - uint64_t tmp_bd_fd = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - - memcpy (&preop, bd_fd->entry->attr, sizeof(preop)); - if (offset > bd_fd->entry->size) { - op_errno = bd_resize (priv, bd_fd->entry, &offset); - if (op_errno) - goto out; - if (offset > bd_fd->entry->size) { - bd_fd->entry->attr->ia_size = offset; - bd_fd->entry->size = offset; - } - } - /* If the requested size is less then current size - * we will not update that in bd_fd->entry->attr - * because it will result in showing size of this file less - * instead we will return 0 for less size truncation - */ - BD_ENTRY_UPDATE_MTIME (bd_fd->entry); - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - - op_ret = 0; 
-out: - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop, - &postop, NULL); - return 0; -} - -int32_t -bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct iatt prebuf = {0, }; - struct iatt postbuf = {0, }; - bd_entry_t *lventry = NULL; - bd_priv_t *priv = NULL; - off_t size = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&prebuf, lventry->attr, sizeof(prebuf)); - if (offset > lventry->size) { - op_errno = bd_resize (priv, lventry, &size); - if (op_errno) - goto out; - if (lventry->size < offset) { - lventry->attr->ia_size = offset; - lventry->size = size; - } - } - BD_ENTRY_UPDATE_MTIME (lventry); - memcpy (&postbuf, lventry->attr, sizeof(postbuf)); - BD_PUT_ENTRY (priv, lventry); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - &prebuf, &postbuf, NULL); - return 0; -} - -int32_t -__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, - uint64_t bd_size) -{ - int32_t op_ret = 0; - int index = 0; - int retval = 0; - off_t internal_offset = 0; - int no_space = 0; - - if (!vector) - return -EFAULT; - - internal_offset = offset; - for (index = 0; index < count; index++) { - if (internal_offset >= bd_size) { - op_ret = -ENOSPC; - goto err; - } - if (internal_offset + vector[index].iov_len >= bd_size) { - vector[index].iov_len = bd_size - internal_offset; - no_space = 1; - } - - retval = pwrite (fd, vector[index].iov_base, - vector[index].iov_len, internal_offset); - if (retval == -1) { - gf_log (THIS->name, GF_LOG_WARNING, - "base %p, length %ld, offset %ld, message %s", - 
vector[index].iov_base, vector[index].iov_len, - internal_offset, strerror (errno)); - op_ret = -errno; - goto err; - } - op_ret += retval; - internal_offset += retval; - if (no_space) - break; - } -err: - return op_ret; -} - -int bd_create_lv (bd_priv_t *priv, bd_entry_t *p_entry, const char *vg_name, - const char *lv_name, char *size, mode_t mode) -{ - vg_t vg = NULL; - int ret = -1; - char *path = NULL; - struct iatt iattr = {0, }; - bd_entry_t *lventry = NULL; - uint64_t extent = 0; - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vg_name, "w", 0); - if (!vg) { - ret = -1; - goto out; - } - extent = lvm_vg_get_extent_size (vg); - if (size) - gf_string2bytesize (size, &extent); - - if (lvm_vg_create_lv_linear (vg, lv_name, extent) == NULL) { - ret = -EAGAIN; - lvm_vg_close (vg); - goto out; - } - lvm_vg_close (vg); - - gf_asprintf (&path, "/dev/%s/%s", vg_name, lv_name); - if (!path) { - ret = -ENOMEM; - lvm_vg_close (vg); - goto out; - } - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = extent; - if (!mode) - mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - - iattr.ia_type = ia_type_from_st_mode (mode); - iattr.ia_prot = ia_prot_from_st_mode (mode); - lventry = bd_entry_add (p_entry, lv_name, &iattr, IA_IFREG); - if (!lventry) { - ret = -EAGAIN; - goto out; - } - ret = 0; -out: - BD_UNLOCK (&priv->lock); - if (path) - GF_FREE (path); - return ret; -} - -int bd_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - bd_fd_t *pfd = NULL; - char *vg_name = NULL; - char *volume = NULL; - char *path = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - 
VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - volume = vg_name = gf_strdup (loc->path); - if (!volume) - goto out; - volume = strrchr (volume, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - *volume = '\0'; - - BD_ENTRY (priv, p_entry, vg_name); - if (!p_entry) { - op_errno = ENOENT; - goto out; - } - - memcpy (&preparent, p_entry->attr, sizeof(preparent)); - - op_errno = bd_create_lv (priv, p_entry, p_entry->name, loc->name, 0, - mode); - if (op_errno) - goto out; - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - gf_log (this->name, GF_LOG_WARNING, - "newly created LV not available %s", loc->path); - op_errno = EAGAIN; - goto out; - } - - /* Mask O_CREATE since we created LV */ - flags &= ~(O_CREAT | O_EXCL); - - gf_asprintf (&path, "/dev/%s/%s", p_entry->name, loc->name); - if (!path) { - op_errno = ENOMEM; - goto out; - } - _fd = open (path, flags, 0); - if (_fd == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "open on %s: %s", path, strerror (op_errno)); - goto out; - } - - memcpy (&stbuf, lventry->attr, sizeof(stbuf)); - - pfd = GF_CALLOC (1, sizeof(*pfd), gf_bd_fd); - if (!pfd) { - op_errno = errno; - goto out; - } - pfd->flag = flags; - pfd->fd = _fd; - pfd->entry = lventry; - - if (fd_ctx_set (fd, this, (uint64_t)(long)pfd)) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set the fd context path=%s fd=%p", - loc->name, fd); - goto out; - } - - op_ret = 0; - - memcpy (&postparent, p_entry->attr, sizeof(postparent)); -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (path) - GF_FREE (path); - if (op_ret < 0 && lventry) - BD_PUT_ENTRY (priv, lventry); - if (vg_name) - GF_FREE (vg_name); - - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); 
- return 0; -} - -/* - * We don't do actual setattr on devices on the host side, we just update - * the entries in server process & they are not persistent - */ -int bd_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - struct iatt statpre = {0, }; - struct iatt statpost = {0, }; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - bd_fd_t *pfd = NULL; - int ret = 0; - uint64_t tmp_pfd = 0; - int _fd = -1; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (bd_fd_t *)(long)tmp_pfd; - - _fd = pfd->fd; - memcpy (&statpre, pfd->entry->attr, sizeof(statpre)); - op_ret = 0; - - if (valid & GF_SET_ATTR_MODE) - pfd->entry->attr->ia_prot = stbuf->ia_prot; - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { - if (valid & GF_SET_ATTR_UID) - pfd->entry->attr->ia_uid = stbuf->ia_uid; - if (valid & GF_SET_ATTR_GID) - pfd->entry->attr->ia_gid = stbuf->ia_gid; - } - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - pfd->entry->attr->ia_atime = stbuf->ia_atime; - pfd->entry->attr->ia_atime_nsec = stbuf->ia_atime_nsec; - pfd->entry->attr->ia_mtime = stbuf->ia_mtime; - pfd->entry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec; - } - memcpy (&statpost, pfd->entry->attr, sizeof(statpost)); - op_errno = 0; -out: - STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL); - return 0; -} - -int bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - struct iatt statpre = {0, }; - struct iatt statpost = {0, }; - bd_entry_t *lventry = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - char path[PATH_MAX] = {0, }; - - priv = this->private; - - /* - * We don't allow to do setattr on / on host side - * ie /dev - */ - if (!strcmp (loc->path, "/")) { - op_ret = 
0; - goto out; - } - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - sprintf (path, "/dev/%s/%s", lventry->parent->name, lventry->name); - - memcpy (&statpre, lventry->attr, sizeof(statpre)); - if (valid & GF_SET_ATTR_MODE) - lventry->attr->ia_prot = stbuf->ia_prot; - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { - if (valid & GF_SET_ATTR_UID) - lventry->attr->ia_uid = stbuf->ia_uid; - if (valid & GF_SET_ATTR_GID) - lventry->attr->ia_gid = stbuf->ia_gid; - } - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - lventry->attr->ia_atime = stbuf->ia_atime; - lventry->attr->ia_atime_nsec = stbuf->ia_atime_nsec; - lventry->attr->ia_mtime = stbuf->ia_mtime; - lventry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec; - } - memcpy (&statpost, lventry->attr, sizeof(statpost)); - op_errno = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL); - return 0; -} - -int -bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - bd_priv_t *priv = NULL; - bd_fd_t *bd_fd = NULL; - int ret = -1; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - uint64_t tmp_bd_fd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (vector, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL from fd=%p", fd); - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - _fd = bd_fd->fd; - - memcpy (&preop, bd_fd->entry->attr, sizeof(preop)); - op_ret = __bd_pwritev (_fd, vector, count, offset, bd_fd->entry->size); - if (op_ret < 0) { - op_errno 
= -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 - ", %s", offset, strerror (op_errno)); - goto out; - } - BD_ENTRY_UPDATE_MTIME (bd_fd->entry); - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - -out: - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, - &postop, NULL); - - return 0; -} - -int32_t -bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) -{ - struct iatt buf = {0, }; - int32_t op_ret = -1; - int32_t entry_ret = 0; - int32_t op_errno = 0; - char *pathdup = NULL; - bd_entry_t *bdentry = NULL; - struct iatt postparent = {0, }; - bd_priv_t *priv = NULL; - char *p = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - entry_ret = -1; - goto parent; - } - memcpy (&buf, bdentry->attr, sizeof(buf)); - BD_PUT_ENTRY (priv, bdentry); - -parent: - if (loc->parent) { - pathdup = p = gf_strdup (loc->path); - if (!pathdup) { - op_errno = ENOMEM; - entry_ret = -1; - goto out; - } - p = strrchr (pathdup, '/'); - if (p == pathdup) - *(p+1) = '\0'; - else - *p = '\0'; - BD_ENTRY (priv, bdentry, pathdup); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lookup on parent of %s " - "failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&postparent, bdentry->attr, sizeof(postparent)); - BD_PUT_ENTRY (priv, bdentry); - } - - op_ret = entry_ret; -out: - if (pathdup) - GF_FREE (pathdup); - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &buf, NULL, &postparent); - - return 0; -} - -int32_t -bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ - struct iatt buf = {0,}; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_entry_t *bdentry = NULL; - 
bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, "stat on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&buf, bdentry->attr, sizeof(buf)); - BD_PUT_ENTRY (priv, bdentry); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); - - return 0; -} - -int32_t -bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t tmp_bd_fd = 0; - struct iatt buf = {0, }; - bd_fd_t *bd_fd = NULL; - int _fd = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -EINVAL; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - _fd = bd_fd->fd; - - memcpy (&buf, bd_fd->entry->attr, sizeof(buf)); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); - return 0; -} - -int32_t -bd_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bd_fd_t *bd_fd = NULL; - bd_entry_t *bdentry = NULL; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, "opendir failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd); - if (!bd_fd) { 
- op_errno = errno; - BD_PUT_ENTRY (priv, bdentry); - goto out; - } - - bd_fd->p_entry = bdentry; - - bdentry = list_entry ((&bdentry->child)->next, typeof(*bdentry), child); - if (!bdentry) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL"); - goto out; - } - bdentry = list_entry ((&bdentry->sibling), typeof(*bdentry), sibling); - if (!bdentry) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL"); - goto out; - } - - bd_fd->entry = bdentry; - - op_ret = fd_ctx_set (fd, this, (uint64_t) (long)bd_fd); - if (op_ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set the fd context path=%s fd=%p", - loc->path, fd); - goto out; - } - - op_ret = 0; -out: - if (op_ret == -1) { - BD_PUT_ENTRY (priv, bd_fd->p_entry); - if (bd_fd) - GF_FREE (bd_fd); - } - - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL); - return 0; -} - -int32_t -bd_releasedir (xlator_t *this, fd_t *fd) -{ - bd_fd_t *bd_fd = NULL; - uint64_t tmp_bd_fd = 0; - int ret = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_del (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "bd_fd from fd=%p is NULL", - fd); - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - BD_PUT_ENTRY (priv, bd_fd->p_entry); - - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - GF_FREE (bd_fd); -out: - return 0; -} - -/* - * bd_statfs: Mimics statfs by returning used/free extents in the VG - * TODO: IF more than one VG allowed per volume, this functions needs some - * change - */ -int32_t -bd_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct statvfs buf = {0, }; - vg_t vg = NULL; - char *vg_name = NULL; - uint64_t size = 0; - uint64_t fr_size = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, 
out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = dict_get_str (this->options, "export", &vg_name); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify volume groups"); - op_errno = EINVAL; - goto out; - } - - BD_RD_LOCK (&priv->lock); - - vg = lvm_vg_open (priv->handle, vg_name, "r", 0); - size += lvm_vg_get_size (vg); - fr_size += lvm_vg_get_free_size (vg); - lvm_vg_close (vg); - - BD_UNLOCK (&priv->lock); - - if (statvfs ("/", &buf) < 0) { - op_errno = errno; - goto out; - } - op_ret = 0; - buf.f_blocks = size / buf.f_frsize; - buf.f_bfree = fr_size / buf.f_frsize; - buf.f_bavail = fr_size / buf.f_frsize; -out: - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL); - return 0; -} - -int32_t -bd_release (xlator_t *this, fd_t *fd) -{ - bd_fd_t *bd_fd = NULL; - int ret = -1; - uint64_t tmp_bd_fd = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL from fd=%p", - fd); - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - close (bd_fd->fd); - BD_PUT_ENTRY (priv, bd_fd->entry); - - GF_FREE (bd_fd); -out: - return 0; -} - -int32_t -bd_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t datasync, dict_t *xdata) -{ - int _fd = -1; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t tmp_bd_fd = 0; - bd_fd_t *bd_fd = NULL; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - - 
_fd = bd_fd->fd; - memcpy (&preop, &bd_fd->entry->attr, sizeof(preop)); - if (datasync) { - ; -#ifdef HAVE_FDATASYNC - op_ret = fdatasync (_fd); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "fdatasync on fd=%p failed: %s", - fd, strerror (errno)); - } -#endif - } else { - op_ret = fsync (_fd); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsync on fd=%p failed: %s", - fd, strerror (op_errno)); - goto out; - } - } - - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, - &postop, NULL); - - return 0; -} - -int32_t -bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - uint64_t tmp_bd_fd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL on fd=%p", fd); - goto out; - } - op_ret = 0; -out: - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); - - return 0; -} - -int -__bd_fill_readdir (pthread_rwlock_t *bd_lock, bd_fd_t *bd_fd, off_t off, - size_t size, gf_dirent_t *entries) -{ - size_t filled = 0; - int count = 0; - struct dirent entry = {0, }; - int32_t this_size = -1; - gf_dirent_t *this_entry = NULL; - bd_entry_t *bdentry = NULL; - bd_entry_t *cur_entry = NULL; - bd_entry_t *n_entry = NULL; - - BD_RD_LOCK (bd_lock); - - bdentry = list_entry ((&bd_fd->p_entry->child)->next, typeof(*n_entry), - child); - - if (off) { - int i = 0; - list_for_each_entry (n_entry, &bd_fd->entry->sibling, sibling) { - if (i == off && strcmp (n_entry->name, "")) { - bd_fd->entry = n_entry; - break; - } - } - } else - bd_fd->entry = list_entry ((&bdentry->sibling), - typeof(*n_entry), sibling); - - while (filled <= size) { - cur_entry = bd_fd->entry; - - n_entry = list_entry 
((&bd_fd->entry->sibling)->next, - typeof (*cur_entry), sibling); - if (&n_entry->sibling == (&bdentry->sibling)) - break; - - strcpy (entry.d_name, n_entry->name); - entry.d_ino = n_entry->attr->ia_ino; - entry.d_off = off; - if (n_entry->attr->ia_type == IA_IFDIR) - entry.d_type = DT_DIR; - else - entry.d_type = DT_REG; - - this_size = max (sizeof(gf_dirent_t), - sizeof (gfs3_dirplist)) - + strlen (entry.d_name) + 1; - - if (this_size + filled > size) - break; - - bd_fd->entry = n_entry; - - this_entry = gf_dirent_for_name (entry.d_name); - if (!this_entry) { - gf_log (THIS->name, GF_LOG_ERROR, - "could not create gf_dirent for entry %s", - entry.d_name); - goto out; - } - this_entry->d_off = off; - this_entry->d_ino = entry.d_ino; - this_entry->d_type = entry.d_type; - off++; - - list_add_tail (&this_entry->list, &entries->list); - - filled += this_size; - count++; - } -out: - BD_UNLOCK (bd_lock); - return count; -} - -int32_t -bd_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int whichop) -{ - uint64_t tmp_bd_fd = 0; - bd_fd_t *bd_fd = NULL; - int ret = -1; - int count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - gf_dirent_t entries; - gf_dirent_t *tmp_entry = NULL; - bd_entry_t *bdentry = NULL; - bd_priv_t *priv = NULL; - char *devpath = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - INIT_LIST_HEAD (&entries.list); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL, fd=%p", fd); - op_errno = -EINVAL; - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - LOCK (&fd->lock); - { - count = __bd_fill_readdir (&priv->lock, bd_fd, off, - size, &entries); - } - UNLOCK (&fd->lock); - - /* pick ENOENT to indicate EOF */ - op_errno = errno; - op_ret = count; - - if (whichop != GF_FOP_READDIRP) - goto out; - - BD_RD_LOCK (&priv->lock); - 
list_for_each_entry (tmp_entry, &entries.list, list) { - char path[PATH_MAX]; - sprintf (path, "%s/%s", bd_fd->p_entry->name, - tmp_entry->d_name); - bdentry = bd_entry_get (path); - if (!bdentry) { - gf_log (this->name, GF_LOG_WARNING, - "entry failed %s\n", tmp_entry->d_name); - continue; - } - if (bdentry->attr->ia_ino) - tmp_entry->d_ino = bdentry->attr->ia_ino; - memcpy (&tmp_entry->d_stat, - bdentry->attr, sizeof (tmp_entry->d_stat)); - bd_entry_put (bdentry); - GF_FREE (devpath); - } - BD_UNLOCK (&priv->lock); - -out: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); - - gf_dirent_free (&entries); - - return 0; -} - -int32_t -bd_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *dict) -{ - bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); - return 0; -} - - -int32_t -bd_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *dict) -{ - bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); - return 0; -} - -int32_t -bd_priv (xlator_t *this) -{ - return 0; -} - -int32_t -bd_inode (xlator_t *this) -{ - return 0; -} - -/* unsupported interfaces */ -int32_t -bd_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - char *dest = NULL; - - dest = alloca (size + 1); - STACK_UNWIND_STRICT (readlink, frame, -1, ENOSYS, dest, &stbuf, NULL); - return 0; -} - -int -bd_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - - STACK_UNWIND_STRICT (mknod, frame, -1, ENOSYS, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int -bd_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt 
postparent = {0, }; - - STACK_UNWIND_STRICT (mkdir, frame, -1, ENOSYS, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int -bd_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) -{ - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - - STACK_UNWIND_STRICT (rmdir, frame, -1, ENOSYS, - &preparent, &postparent, NULL); - return 0; -} - -int32_t -bd_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags, dict_t *xdata) -{ - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int flags, dict_t *xdata) -{ - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOSYS, NULL, NULL); - return 0; -} - -int32_t -bd_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOSYS, NULL, NULL); - - return 0; -} - -int32_t -bd_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (removexattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fsyncdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOSYS, NULL); - return 0; -} - -static int gf_bd_lk_log; -int32_t -bd_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) -{ - struct gf_flock nullock = {0, }; - - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, 
GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL); - return 0; -} - -int32_t -bd_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL); - return 0; -} - - -int32_t -bd_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. 
You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_rchecksum (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, int32_t len, dict_t *xdata) -{ - int32_t weak_checksum = 0; - unsigned char strong_checksum[MD5_DIGEST_LENGTH]; - - STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOSYS, - weak_checksum, strong_checksum, NULL); - return 0; -} - -int -bd_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL); - return 0; -} - - -int -bd_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL); - return 0; -} - -int bd_xl_op_create (bd_priv_t *priv, dict_t *input, dict_t *output) -{ - char *vg = NULL; - char *lv = NULL; - char *path = NULL; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - char *size = 0; - int ret = -1; - char *error = NULL; - int retval = -1; - char *buff = NULL; - char *buffp = NULL; - char *save = NULL; - - ret = dict_get_str (input, "size", &size); - if (ret) { - gf_asprintf (&error, "no size specified"); - goto out; - } - ret = dict_get_str (input, "path", &path); - if (ret) { - gf_asprintf (&error, "no path specified"); - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - lv = strtok_r (NULL, "/", &save); - - if (!vg || !lv) { - gf_asprintf (&error, "invalid path %s", path); - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - if (!p_entry) { - ret = -ENOENT; - goto out; - } - BD_ENTRY (priv, lventry, path); - if (lventry) { - ret = -EEXIST; - gf_asprintf (&error, "%s already exists", lv); - BD_PUT_ENTRY (priv, lventry); - goto out; - } - - ret = bd_create_lv (priv, p_entry, vg, lv, size, 0); - if (ret < 0) { - 
gf_asprintf (&error, "bd_create_lv error %d", -ret); - goto out; - } - ret = 0; -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - - if (buffp) - GF_FREE (buffp); - - if (error) - retval = dict_set_dynstr (output, "error", error); - return ret; -} - -int bd_xl_op_delete (bd_priv_t *priv, dict_t *input, dict_t *output) -{ - char *vg = NULL; - char *path = NULL; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - int ret = -1; - char *error = NULL; - int retval = -1; - char *buff = NULL; - char *buffp = NULL; - char *save = NULL; - int op_errno = 0; - - ret = dict_get_str (input, "path", &path); - if (ret) { - gf_asprintf (&error, "no path specified"); - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - if (!vg) { - gf_asprintf (&error, "invalid path %s", path); - op_errno = EINVAL; - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - BD_ENTRY (priv, lventry, path); - if (!p_entry || !lventry) { - op_errno = -ENOENT; - gf_asprintf (&error, "%s not found", path); - ret = -1; - goto out; - } - ret = bd_delete_lv (priv, p_entry, lventry, path, &op_errno); - if (ret < 0) { - gf_asprintf (&error, "bd_delete_lv error, error:%d", op_errno); - goto out; - } - ret = 0; -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (buffp) - GF_FREE (buffp); - if (error) - retval = dict_set_dynstr (output, "error", error); - return ret; -} - -int bd_xl_op_clone(bd_priv_t *priv, int subop, dict_t *input, dict_t *output) -{ - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - int ret = -1; - char *error = NULL; - int retval = -1; - char *vg = NULL; - char *lv = NULL; - char *dest_lv = NULL; - char *size = NULL; - char *buff = NULL; - char *buffp = NULL; - char *path = NULL; - char *save = NULL; - char *npath = NULL; - - ret = dict_get_str (input, "path", &path); - ret = dict_get_str (input, "dest_lv", &dest_lv); - ret = dict_get_str (input, "size", &size); - - if 
(!path || !dest_lv) { - gf_asprintf (&error, "invalid arguments"); - ret = -1; - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - lv = strtok_r (NULL, "/", &save); - if (!lv) { - gf_asprintf (&error, "lv not given %s", path); - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - if (!p_entry) { - gf_asprintf (&error, "%s does not exist", vg); - retval = dict_set_str (output, "error", error); - goto out; - } - - BD_ENTRY (priv, lventry, path); - if (!lventry) { - gf_asprintf (&error, "%s does not exist", path); - ret = -1; - goto out; - } - BD_PUT_ENTRY (priv, lventry); - lventry = NULL; - gf_asprintf (&npath, "/%s/%s", vg, dest_lv); - BD_ENTRY (priv, lventry, npath); - if (lventry) { - gf_asprintf (&error, "%s already exists", dest_lv); - BD_PUT_ENTRY (priv, lventry); - ret = -1; - goto out; - } - - if (subop == GF_BD_OP_SNAPSHOT_BD) { - if (!size) { - gf_asprintf (&error, "size not given"); - ret = -1; - goto out; - } - ret = bd_snapshot_lv (priv, p_entry, output, lv, dest_lv, - size, NULL); - } else - ret = bd_clone_lv (priv, p_entry, output, vg, lv, dest_lv, - NULL); - - if (ret) - goto out; - ret = 0; -out: - if (error) - retval = dict_set_dynstr (output, "error", error); - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (npath) - GF_FREE (npath); - if (buffp) - GF_FREE (buffp); - return ret; -} - -int32_t -bd_notify (xlator_t *this, dict_t *input, dict_t *output) -{ - int ret = -1; - int retval = -1; - int32_t bdop = -1; - bd_priv_t *priv = NULL; - char *error = NULL; - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = dict_get_int32 (input, "bd-op", (int32_t *)&bdop); - if (ret) { - gf_asprintf (&error, "no sub-op specified"); - goto out; - } - - switch (bdop) - { - case GF_BD_OP_NEW_BD: - ret = bd_xl_op_create (priv, input, output); - break; - case GF_BD_OP_DELETE_BD: - ret = bd_xl_op_delete (priv, input, output); - break; - case GF_BD_OP_CLONE_BD: - case GF_BD_OP_SNAPSHOT_BD: - ret = 
bd_xl_op_clone (priv, bdop, input, output); - break; - default: - gf_asprintf (&error, "invalid bd-op %d specified", bdop); - retval = dict_set_dynstr (output, "error", error); - goto out; - } - -out: - return ret; -} - -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) -{ - va_list ap; - int ret = 0; - void *data2 = NULL; - dict_t *input = NULL; - dict_t *output = NULL; - - va_start (ap, data); - data2 = va_arg (ap, dict_t *); - va_end (ap); - - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that bd xlator is up */ - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - case GF_EVENT_TRANSLATOR_OP: - input = data; - output = data2; - if (!output) - output = dict_new (); - ret = bd_notify (this, input, output); - break; - - default: - break; - } - return ret; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - - -/** - * init - Constructs lists of LVs in the given VG - */ -int -init (xlator_t *this) -{ - bd_priv_t *_private = NULL; - int ret = 0; - char *vg = NULL; - char *device = NULL; - - LOCK_INIT (&inode_lk); - - bd_rootp = bd_entry_add_root (); - if (!bd_rootp) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: adding root entry failed"); - return -1; - } - - if (this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd cannot have subvolumes"); - ret = -1; - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. 
Please check the volume file."); - } - - ret = dict_get_str (this->options, "device", &device); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify backend"); - return -1; - } - - /* Now we support only LV device */ - if (strcasecmp (device, BACKEND_VG)) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: unknown %s backend %s", BD_XLATOR, device); - return -1; - } - - ret = dict_get_str (this->options, "export", &vg); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify volume groups"); - return -1; - } - - ret = 0; - _private = GF_CALLOC (1, sizeof(*_private), gf_bd_private); - if (!_private) - goto error; - - pthread_rwlock_init (&_private->lock, NULL); - this->private = (void *)_private; - _private->handle = NULL; - _private->vg = gf_strdup (vg); - if (!_private->vg) { - goto error; - } - - if (bd_build_lv_list (this->private, vg) < 0) - goto error; - -out: - return 0; -error: - BD_WR_LOCK (&_private->lock); - bd_entry_cleanup (); - lvm_quit (_private->handle); - if (_private->vg) - GF_FREE (_private->vg); - GF_FREE (_private); - return -1; -} - -void -fini (xlator_t *this) -{ - bd_priv_t *priv = this->private; - if (!priv) - return; - lvm_quit (priv->handle); - BD_WR_LOCK (&priv->lock); - bd_entry_cleanup (); - BD_UNLOCK (&priv->lock); - GF_FREE (priv->vg); - this->private = NULL; - GF_FREE (priv); - return; -} - -struct xlator_dumpops dumpops = { - .priv = bd_priv, - .inode = bd_inode, -}; - -struct xlator_fops fops = { - /* Not supported */ - .readlink = bd_readlink, - .mknod = bd_mknod, - .mkdir = bd_mkdir, - .rmdir = bd_rmdir, - .setxattr = bd_setxattr, - .fsetxattr = bd_fsetxattr, - .getxattr = bd_getxattr, - .fgetxattr = bd_fgetxattr, - .removexattr = bd_removexattr, - .fremovexattr= bd_fremovexattr, - .fsyncdir = bd_fsyncdir, - .lk = bd_lk, - .inodelk = bd_inodelk, - .finodelk = bd_finodelk, - .entrylk = bd_entrylk, - .fentrylk = bd_fentrylk, - .rchecksum = bd_rchecksum, - 
.xattrop = bd_xattrop, - - /* Supported */ - .lookup = bd_lookup, - .opendir = bd_opendir, - .readdir = bd_readdir, - .readdirp = bd_readdirp, - .stat = bd_stat, - .statfs = bd_statfs, - .open = bd_open, - .access = bd_access, - .flush = bd_flush, - .readv = bd_readv, - .fstat = bd_fstat, - .truncate = bd_truncate, - .ftruncate = bd_ftruncate, - .fsync = bd_fsync, - .writev = bd_writev, - .fstat = bd_fstat, - .create = bd_create, - .setattr = bd_setattr, - .fsetattr = bd_fsetattr, - .unlink = bd_unlink, - .link = bd_link, - .symlink = bd_symlink, - .rename = bd_rename, -}; - -struct xlator_cbks cbks = { - .releasedir = bd_releasedir, - .release = bd_release, -}; - -struct volume_options options[] = { - { .key = {"export"}, - .type = GF_OPTION_TYPE_STR}, - { .key = {"device"}, - .type = GF_OPTION_TYPE_STR}, - { .key = {NULL} } -}; diff --git a/xlators/storage/bd_map/src/bd_map.h b/xlators/storage/bd_map/src/bd_map.h deleted file mode 100644 index 1a0f4248e..000000000 --- a/xlators/storage/bd_map/src/bd_map.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#ifndef _BD_MAP_H -#define _BD_MAP_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "mem-types.h" - -#define BD_XLATOR "block device mapper xlator" - -#define BACKEND_VG "vg" - -enum gf_bd_mem_types_ { - gf_bd_fd = gf_common_mt_end + 1, - gf_bd_private, - gf_bd_entry, - gf_bd_attr, - gf_bd_mt_end -}; - -/* - * Each BD/LV is represented by this data structure - * Usually root entry will have only children and there is no sibling for that - * All other entries may have children and/or sibling entries - * If an entry is a Volume Group it will have child (. & .. and Logical - * Volumes) and also other Volume groups will be a sibling for this - */ -typedef struct bd_entry { - struct list_head child; /* List to child */ - struct list_head sibling; /* List of siblings */ - struct bd_entry *parent;/* Parent of this node */ - struct bd_entry *link; /* Link to actual entry, if its . or .. */ - char name[NAME_MAX]; - struct iatt *attr; - int refcnt; - uint64_t size; - pthread_rwlock_t lock; -} bd_entry_t; - -/** - * bd_fd - internal structure common to file and directory fd's - */ -typedef struct bd_fd { - bd_entry_t *entry; - bd_entry_t *p_entry; /* Parent entry */ - int fd; - int32_t flag; -} bd_fd_t; - -typedef struct bd_priv { - lvm_t handle; - pthread_rwlock_t lock; - char *vg; -} bd_priv_t; - -#endif diff --git a/xlators/storage/bd_map/src/bd_map_help.c b/xlators/storage/bd_map/src/bd_map_help.c deleted file mode 100644 index 0613aa383..000000000 --- a/xlators/storage/bd_map/src/bd_map_help.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. 
Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#define __XOPEN_SOURCE 500 - -#include <libgen.h> -#include <time.h> -#include <lvm2app.h> - -#include "bd_map.h" -#include "bd_map_help.h" -#include "defaults.h" -#include "glusterfs3-xdr.h" - -#define CHILD_ENTRY(node) list_entry ((&node->child)->next, typeof(*node), \ - child) - -bd_entry_t *bd_rootp; -gf_lock_t inode_lk; -static uint64_t bd_entry_ino = 5000; /* Starting inode */ - -static void bd_entry_get_ino (uint64_t *inode) -{ - LOCK (&inode_lk); - { - *inode = bd_entry_ino++; - } - UNLOCK (&inode_lk); -} - -void bd_update_time (bd_entry_t *entry, int type) -{ - struct timespec ts; - - clock_gettime (CLOCK_REALTIME, &ts); - if (type == 0) { - entry->attr->ia_mtime = ts.tv_sec; - entry->attr->ia_mtime_nsec = ts.tv_nsec; - entry->attr->ia_atime = ts.tv_sec; - entry->attr->ia_atime_nsec = ts.tv_nsec; - } else if (type == 1) { - entry->attr->ia_mtime = ts.tv_sec; - entry->attr->ia_mtime_nsec = ts.tv_nsec; - } else { - entry->attr->ia_atime = ts.tv_sec; - entry->attr->ia_atime_nsec = ts.tv_nsec; - } -} - -static bd_entry_t *bd_entry_init (const char *name) -{ - bd_entry_t *bdentry; - - bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry); - if (!bdentry) - return NULL; - - bdentry->attr = GF_MALLOC (sizeof(struct iatt), gf_bd_attr); - if (!bdentry->attr) { - GF_FREE (bdentry); - return NULL; - } - - strcpy (bdentry->name, name); - INIT_LIST_HEAD (&bdentry->sibling); - INIT_LIST_HEAD (&bdentry->child); - bdentry->link = NULL; - bdentry->refcnt = 0; - return bdentry; -} - -static bd_entry_t *bd_entry_clone (bd_entry_t *orig, char *name) -{ - bd_entry_t *bdentry; - - bdentry = GF_MALLOC 
(sizeof(bd_entry_t), gf_bd_entry); - if (!bdentry) - return NULL; - - bdentry->attr = orig->attr; - - strcpy (bdentry->name, name); - INIT_LIST_HEAD (&bdentry->sibling); - INIT_LIST_HEAD (&bdentry->child); - bdentry->link = orig; - bdentry->refcnt = 0; - return bdentry; -} - -static void bd_entry_init_iattr (struct iatt *attr, int type) -{ - struct timespec ts = {0, }; - - clock_gettime (CLOCK_REALTIME, &ts); - attr->ia_dev = ia_makedev (0, 0); /* FIXME: */ - attr->ia_type = type; - attr->ia_prot = ia_prot_from_st_mode (0750); - attr->ia_nlink = 2; - attr->ia_uid = 0; - attr->ia_gid = 0; - attr->ia_rdev = ia_makedev (0, 0); - - attr->ia_size = 4096; /* FIXME */ - attr->ia_blksize = 4096; - attr->ia_blocks = 0; - - attr->ia_atime = ts.tv_sec; - attr->ia_atime_nsec = ts.tv_nsec; - attr->ia_mtime = ts.tv_sec; - attr->ia_mtime_nsec = ts.tv_nsec; - attr->ia_ctime = ts.tv_sec; - attr->ia_ctime_nsec = ts.tv_nsec; -} - -/* - * bd_entry_istat: Initialize iatt strucutre for a given path on success - */ -void bd_entry_istat (const char *path, struct iatt *attr, int type) -{ - struct stat stbuf = {0, }; - - if (stat (path, &stbuf) < 0) - bd_entry_init_iattr (attr, type); - else - iatt_from_stat (attr, &stbuf); - sprintf ((char *)attr->ia_gfid, "%lx", stbuf.st_ino); -} - -/* - * Adds the root entry and required entries - * ie header entry followed by . and .. 
entries - */ -bd_entry_t *bd_entry_add_root (void) -{ - bd_entry_t *bdentry = NULL; - bd_entry_t *h_entry = NULL; - bd_entry_t *d_entry = NULL; - bd_entry_t *dd_entry = NULL; - - bdentry = bd_entry_init ("/"); - if (!bdentry) - return NULL; - - bdentry->parent = bdentry; - - bd_entry_get_ino (&bdentry->attr->ia_ino); - sprintf ((char *)bdentry->attr->ia_gfid, "%ld", - bdentry->attr->ia_ino << 2); - bd_entry_init_iattr (bdentry->attr, IA_IFDIR); - - h_entry = bd_entry_clone (bdentry, ""); - bdentry->child.next = &h_entry->child; - bdentry->child.prev = &h_entry->child; - - d_entry = bd_entry_clone (bdentry, "."); - dd_entry = bd_entry_clone (bdentry, ".."); - - list_add_tail (&d_entry->sibling, &h_entry->sibling); - list_add_tail (&dd_entry->sibling, &h_entry->sibling); - return bdentry; -} - -bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name, - struct iatt *iattr, ia_type_t type) -{ - bd_entry_t *bdentry = NULL; - bd_entry_t *h_entry = NULL; - bd_entry_t *d_entry = NULL; - bd_entry_t *dd_entry = NULL; - bd_entry_t *sentry = NULL; - struct timespec ts = { 0, }; - - if (!parent) - parent = bd_rootp; - - if (type != IA_IFREG && type != IA_IFDIR) - return NULL; - - bdentry = bd_entry_init (name); - if (!bdentry) - return NULL; - - bdentry->parent = parent; - - iattr->ia_type = type; - - bd_entry_get_ino (&iattr->ia_ino); - if (IA_ISDIR(type)) { - h_entry = bd_entry_clone (bdentry, ""); - parent->attr->ia_nlink++; - bdentry->child.next = &h_entry->child; - bdentry->child.prev = &h_entry->child; - - d_entry = bd_entry_clone (bdentry, "."); - dd_entry = bd_entry_clone (bdentry, ".."); - - list_add_tail (&d_entry->sibling, &h_entry->sibling); - list_add_tail (&dd_entry->sibling, &h_entry->sibling); - } - memcpy (bdentry->attr, iattr, sizeof(*iattr)); - - clock_gettime (CLOCK_REALTIME, &ts); - parent->attr->ia_mtime = ts.tv_sec; - parent->attr->ia_mtime_nsec = ts.tv_nsec; - bdentry->size = iattr->ia_size; - - sentry = CHILD_ENTRY (parent); - list_add_tail 
(&bdentry->sibling, &sentry->sibling); - return bdentry; -} - -bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent) -{ - bd_entry_t *centry = NULL; - bd_entry_t *bdentry = NULL; - - if (!parent) - parent = bd_rootp; - - if (parent->child.next == &parent->child) - return NULL; - - centry = CHILD_ENTRY (parent); - if (!strcmp (centry->name, name)) - return centry; - - list_for_each_entry (bdentry, ¢ry->sibling, sibling) { - if (!strcmp (bdentry->name, name)) - return bdentry; - } - return NULL; -} - -/* FIXME: Do we need hashing here? */ -bd_entry_t *bd_entry_find_by_gfid (const char *path) -{ - bd_entry_t *h = NULL; - bd_entry_t *tmp = NULL; - bd_entry_t *tmp2 = NULL; - bd_entry_t *node = NULL; - bd_entry_t *cnode = NULL; - bd_entry_t *leaf = NULL; - char *gfid = NULL; - char *cp = NULL; - char *bgfid = NULL; - bd_entry_t *entry = NULL; - - gfid = GF_MALLOC (strlen(path) + 1, gf_common_mt_char); - sscanf (path, "<gfid:%s", gfid); - if (!gfid) - return NULL; - - cp = strchr(gfid, '>'); - *cp = '\0'; - - node = CHILD_ENTRY (bd_rootp); - - bgfid = GF_MALLOC (GF_UUID_BUF_SIZE, gf_common_mt_char); - if (!bgfid) - return NULL; - - list_for_each_entry_safe (h, tmp, &node->sibling, sibling) { - uuid_utoa_r (h->attr->ia_gfid, bgfid); - if (!h->link && !strcmp (gfid, bgfid)) { - entry = h; - goto out; - } - - /* if we have children for this node */ - if (h->child.next != &h->child) { - cnode = CHILD_ENTRY (h); - uuid_utoa_r (cnode->attr->ia_gfid, bgfid); - if (!cnode->link && !strcmp (gfid, bgfid)) { - entry = cnode; - goto out; - } - - list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling), - sibling) { - uuid_utoa_r (leaf->attr->ia_gfid, bgfid); - if (!leaf->link && !strcmp (gfid, bgfid)) { - entry = leaf; - goto out; - } - - } - } - } -out: - if (bgfid) - GF_FREE (bgfid); - - return entry; -} - -/* Called with priv->bd_lock held */ -bd_entry_t *bd_entry_get (const char *name) -{ - bd_entry_t *pentry = NULL; - char *path = NULL; - char *comp = NULL; - char 
*save = NULL; - - if (!strncmp (name, "<gfid:", 5)) { - pentry = bd_entry_find_by_gfid (name); - if (pentry) - pentry->refcnt++; - return pentry; - } - - if (!strcmp (name, "/")) { - bd_rootp->refcnt++; - return bd_rootp; - } - - path = gf_strdup (name); - comp = strtok_r (path, "/", &save); - pentry = bd_entry_get_list (comp, NULL); - if (!pentry) - goto out; - while (comp) { - comp = strtok_r (NULL, "/", &save); - if (!comp) - break; - pentry = bd_entry_get_list (comp, pentry); - if (!pentry) - goto out; - } - - pentry->refcnt++; -out: - GF_FREE (path); - return pentry; -} - -int bd_entry_rm (const char *path) -{ - bd_entry_t *bdentry = NULL; - int ret = -1; - - bdentry = bd_entry_get (path); - if (!bdentry) - goto out; - - list_del_init (&bdentry->sibling); - list_del_init (&bdentry->child); - GF_FREE (bdentry); - - ret = 0; -out: - return ret; -} - - - -/* Called with priv->bd_lock held */ -void bd_entry_put (bd_entry_t *entry) -{ - entry->refcnt--; -} - -int bd_build_lv_list (bd_priv_t *priv, char *vg_name) -{ - struct dm_list *lv_dm_list = NULL; - struct lvm_lv_list *lv_list = NULL; - struct iatt iattr = {0, }; - char path[PATH_MAX] = {0, }; - vg_t vg = NULL; - bd_entry_t *vg_map = NULL; - bd_entry_t *bd = NULL; - int ret = -1; - const char *lv_name = NULL; - - priv->handle = lvm_init (NULL); - if (!priv->handle) { - gf_log (THIS->name, GF_LOG_CRITICAL, "FATAL: bd_init failed"); - return -1; - } - - BD_WR_LOCK (&priv->lock); - - vg = lvm_vg_open (priv->handle, vg_name, "r", 0); - if (!vg) { - gf_log (THIS->name, GF_LOG_CRITICAL, - "opening vg %s failed", vg_name); - goto out; - } - /* get list of LVs associated with this VG */ - lv_dm_list = lvm_vg_list_lvs (vg); - sprintf (path, "/dev/%s", vg_name); - bd_entry_istat (path, &iattr, IA_IFDIR); - vg_map = bd_entry_add (bd_rootp, vg_name, &iattr, - IA_IFDIR); - if (!vg_map) { - gf_log (THIS->name, GF_LOG_CRITICAL, - "bd_add_entry failed"); - goto out; - } - ret = 0; - if (!lv_dm_list) /* no lvs for this VG */ - 
goto out; - - dm_list_iterate_items (lv_list, lv_dm_list) { - if (!lv_list) - continue; - lv_name = lvm_lv_get_name (lv_list->lv); - /* snapshot%d is reserved name */ - if (!strncmp (lv_name, "snapshot", 8)) - continue; - /* get symbolic path for this LV */ - sprintf (path, "/dev/%s/%s", vg_name, lv_name); - bd_entry_istat (path, &iattr, IA_IFREG); - /* Make the file size equivalant to BD size */ - iattr.ia_size = lvm_lv_get_size (lv_list->lv); - /* got LV, add it to our tree */ - bd = bd_entry_add (vg_map, - lvm_lv_get_name (lv_list->lv), - &iattr, IA_IFREG); - if (bd == NULL) { - gf_log (THIS->name, GF_LOG_ERROR, - "bd_add_entry failed"); - goto out; - } - } -out: - if (vg) - lvm_vg_close (vg); - - BD_UNLOCK (&priv->lock); - return ret; -} - -/* - * Called with bd_lock held to cleanup entire list. If there was a - * reference to any one of the entry, nothing cleared. - * Return 0 on success -1 in case if there is a reference to the entry - */ -int bd_entry_cleanup (void) -{ - bd_entry_t *node = NULL; - bd_entry_t *tmp = NULL; - bd_entry_t *tmp2 = NULL; - bd_entry_t *cnode = NULL; - bd_entry_t *h = NULL; - bd_entry_t *leaf = NULL; - - if (!bd_rootp) - return 0; - - node = CHILD_ENTRY (bd_rootp); - if (node->refcnt) { - gf_log (THIS->name, GF_LOG_WARNING, - "entry %s is inuse\n", node->name); - return -1; - } - list_for_each_entry_safe (h, tmp, &node->sibling, sibling) { - /* if we have children for this node */ - if (h->child.next != &h->child) { - cnode = CHILD_ENTRY (h); - list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling), - sibling) { - list_del_init (&leaf->sibling); - list_del_init (&leaf->child); - if (!leaf->link) - GF_FREE (leaf->attr); - GF_FREE (leaf); - } - list_del_init (&cnode->sibling); - list_del_init (&cnode->child); - if (!cnode->link) - GF_FREE (cnode->attr); - GF_FREE (cnode); - } - if (!h->link) - GF_FREE (h->attr); - GF_FREE (h); - } - GF_FREE (h); - GF_FREE (bd_rootp->attr); - GF_FREE (bd_rootp); - return 0; -} diff --git 
a/xlators/storage/bd_map/src/bd_map_help.h b/xlators/storage/bd_map/src/bd_map_help.h deleted file mode 100644 index 9fafa2d13..000000000 --- a/xlators/storage/bd_map/src/bd_map_help.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client. - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ -#ifndef _BD_MAP_HELP_H -#define _BD_MAP_HELP_H - -#define BD_RD_LOCK(lock) \ - pthread_rwlock_rdlock (lock); - -#define BD_WR_LOCK(lock) \ - pthread_rwlock_wrlock (lock); - -#define BD_UNLOCK(lock) \ - pthread_rwlock_unlock (lock); - -#define BD_WR_ENTRY(priv, bdentry, path) \ - do { \ - BD_WR_LOCK (&priv->lock); \ - bdentry = bd_entry_get (path); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_ENTRY(priv, bdentry, path) \ - do { \ - BD_RD_LOCK (&priv->lock); \ - bdentry = bd_entry_get (path); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_PUT_ENTRY(priv, bdentry) \ - do { \ - BD_RD_LOCK (&priv->lock); \ - bd_entry_put (bdentry); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_ENTRY_UPDATE_TIME(bdentry) bd_update_time (bdentry, 0) -#define BD_ENTRY_UPDATE_ATIME(bdentry) bd_update_time (bdentry, 2) -#define BD_ENTRY_UPDATE_MTIME(bdentry) bd_update_time (bdentry, 1) - -extern bd_entry_t *bd_rootp; -extern gf_lock_t inode_lk; - -void bd_entry_istat (const char *path, struct iatt *attr, int type); -bd_entry_t *bd_entry_add_root (void); -bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name, - struct iatt *iattr, ia_type_t type); -bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent); -bd_entry_t *bd_entry_get (const char *name); -void 
bd_entry_put (bd_entry_t *entry); -int bd_build_lv_list (bd_priv_t *priv, char *vg); -int bd_entry_cleanup (void); -void bd_update_time (bd_entry_t *entry, int type); -int bd_entry_rm (const char *path); - -#endif diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 03623cf04..88efcc784 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -2,7 +2,7 @@ xlator_LTLIBRARIES = posix.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module -avoid-version posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index f807618ce..c3bbddd67 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -136,11 +136,7 @@ posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) /* Hack to notify higher layers of EOF. */ - if (postbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + iov.iov_len) == postbuf.ia_size) - op_errno = ENOENT; - else if (offset > postbuf.ia_size) + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) op_errno = ENOENT; LOCK (&priv->lock); @@ -490,8 +486,8 @@ posix_aio_init (xlator_t *this) goto out; } - ret = pthread_create (&priv->aiothread, NULL, - posix_aio_thread, this); + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); if (ret != 0) { io_destroy (priv->ctxp); goto out; @@ -566,7 +562,7 @@ __posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, { xlator_t *this = THIS; gf_log (this->name, GF_LOG_INFO, - "Linux AIO not availble at build-time." + "Linux AIO not available at build-time." 
" Continuing with synchronous IO"); return; } diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index 33bf3db56..219a582c9 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -573,13 +573,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat } } - ret = lstat (newpath, &newbuf); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "lstat on %s failed (%s)", newpath, strerror (errno)); - return -1; - } - if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { gf_log (this->name, GF_LOG_WARNING, diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 58708a347..e295f8850 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -22,6 +22,7 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -44,16 +45,9 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" #include <fnmatch.h> -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct iatt *stbuf; - loc_t *loc; -} posix_xattr_filler_t; - char *marker_xattrs[] = {"trusted.glusterfs.quota.*", "trusted.glusterfs.*.xtime", NULL}; @@ -181,14 +175,9 @@ _posix_xattr_get_set (dict_t *xattr_req, } } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { loc = filler->loc; - if (loc && !list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_WARNING, - "Failed to set dictionary value for %s", - key); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); + if (loc) { + ret = dict_set_uint32 (filler->xattr, key, + loc->inode->fd_count); if (ret < 0) gf_log (filler->this->name, GF_LOG_WARNING, "Failed to set dictionary value for %s", @@ -896,8 +885,8 @@ 
posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -913,6 +902,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && (stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replicate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e., between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead.
+ */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -926,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, data->data, data->len, 0); if (ret != 0) goto out; @@ -946,35 +1003,47 @@ out: return ret; } +static int +_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp (GFID_XATTR_KEY, k) || + !strcmp ("gfid-req", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || + ZR_FILE_CONTENT_REQUEST(k)) { + return 0; + } + + ret = posix_handle_pair (filler->this, filler->real_path, k, v, + XATTR_CREATE); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict) { int ret = -1; + posix_xattr_filler_t filler = {0,}; + if (!dict) goto out; - int _handle_keyvalue_pair (dict_t *d, char *k, data_t *v, - void *tmp) - { - if 
(!strcmp (GFID_XATTR_KEY, k) || - !strcmp ("gfid-req", k) || - !strcmp ("system.posix_acl_default", k) || - !strcmp ("system.posix_acl_access", k) || - ZR_FILE_CONTENT_REQUEST(k)) { - return 0; - } + filler.this = this; + filler.real_path = path; - ret = posix_handle_pair (this, path, k, v, XATTR_CREATE); - if (ret < 0) { - errno = -ret; - return -1; - } - return 0; - } - - ret = dict_foreach (dict, _handle_keyvalue_pair, NULL); + ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler); out: return ret; @@ -1064,3 +1133,259 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) return ret; } + +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. 
*/ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + 
pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. 
+ */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index cf4e08663..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -23,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -50,6 +52,7 @@ #include "glusterfs3-xdr.h" #include "hashfn.h" #include "posix-aio.h" +#include "glusterfs-acl.h" extern char *marker_xattrs[]; #define ALIGN_SIZE 4096 @@ -128,7 +131,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); if (uuid_is_null (loc->inode->gfid)) { - posix_gfid_set (this, real_path, loc, xdata); + posix_gfid_heal (this, real_path, loc, xdata); 
MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); } @@ -561,6 +564,289 @@ out: return 0; } +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} + +static int32_t +_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct) +{ + size_t num_vect = 0; + int32_t num_loop = 1; + int32_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + size_t remain = 0; + size_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + 
remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + lseek(fd, offset, SEEK_SET); + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = 
_posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 
0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + int32_t posix_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) @@ -1718,6 +2004,9 @@ posix_create (call_frame_t *frame, xlator_t *this, goto out; } + if (was_present) + goto fill_stat; + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1748,6 +2037,7 @@ posix_create (call_frame_t *frame, xlator_t *this, strerror (errno)); } +fill_stat: op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; @@ -1965,11 +2255,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; @@ -2014,22 +2300,6 @@ err: return op_ret; } -char* -_page_aligned_alloc (size_t size, char **aligned_buf) -{ - char *alloc_buf = NULL; - char *buf = NULL; - - alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); - if (!alloc_buf) - goto out; - /* page aligned buffer */ - buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); - *aligned_buf = buf; -out: - return alloc_buf; -} - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) @@ -2078,6 +2348,48 @@ err: return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || 
uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -2092,6 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2113,6 +2428,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. 
+ */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2122,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2139,14 +2476,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } } ret = posix_fdstat (this, _fd, &postop); @@ -2162,9 +2506,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, - NULL); + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -2291,6 +2642,33 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; 
+ pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2302,6 +2680,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, int ret = -1; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2317,6 +2696,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; @@ -2378,6 +2763,17 @@ out: } static int gf_posix_xattr_enotsup_log; +static int +_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_handle_pair (filler->this, filler->real_path, k, v, + filler->flags); +} int32_t posix_setxattr (call_frame_t *frame, xlator_t *this, @@ -2386,7 +2782,8 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2401,17 +2798,13 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; dict_del (dict, GFID_XATTR_KEY); - - int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v, - void *tmp) - { - ret = posix_handle_pair (this, real_path, k, v, flags); - if (ret < 0) { - op_errno = -ret; - } - return ret; - } - op_ret = dict_foreach (dict, _handle_every_keyvalue_pair, NULL); + filler.real_path = real_path; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); 
@@ -2421,6 +2814,53 @@ out: return 0; } + +int +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } + + closedir (fd); + + if (!found) + return -ENOENT; + + ret = dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; + + return ret; +} + /** * posix_getxattr - this function returns a dictionary with all the * key:value pair present as xattr. used for @@ -2475,9 +2915,29 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, dict = dict_new (); if (!dict) { + op_errno = ENOMEM; goto out; } + if (loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? 
+ GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { if (!list_empty (&loc->inode->fd_list)) { ret = dict_set_uint32 (dict, (char *)name, 1); @@ -2501,8 +2961,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, else rpath = real_path; - (void) snprintf (host_buf, 1024, "<POSIX(%s):%s:%s>", - priv->base_path, priv->hostname, rpath); + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); dyn_rpath = gf_strdup (host_buf); if (!dyn_rpath) { @@ -2578,6 +3043,11 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, "supported (try remounting" " brick with 'user_xattr' " "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); } else { gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s: %s (%s)", @@ -2895,6 +3365,17 @@ out: return 0; } +static int +_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_fhandle_pair (filler->this, filler->fd, k, v, + filler->flags); +} int32_t posix_fsetxattr (call_frame_t *frame, xlator_t *this, @@ -2904,7 +3385,9 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; struct posix_fd * pfd = NULL; int _fd = -1; - int ret = -1; + int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2925,17 +3408,13 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, dict_del (dict, GFID_XATTR_KEY); - int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v, - void *tmp) - { - ret = 
posix_fhandle_pair (this, _fd, k, v, flags); - if (ret < 0) { - op_errno = -ret; - } - return ret; - } - - op_ret = dict_foreach (dict, _handle_every_keyvalue_pair, NULL); + filler.fd = _fd; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); @@ -2945,6 +3424,28 @@ out: return 0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; + + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } + + return op_ret; +} + int32_t posix_removexattr (call_frame_t *frame, xlator_t *this, @@ -2953,6 +3454,7 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -2968,6 +3470,22 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. 
+ */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); if (op_ret == -1) { op_errno = errno; @@ -3126,6 +3644,159 @@ __add_long_array (int64_t *dest, int64_t *src, int count) } } +static int +_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + + count = v->len; + array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); + + LOCK (&inode->lock); + { + if (filler->real_path) { + size = sys_lgetxattr (filler->real_path, k, + (char *)array, v->len); + } else { + size = sys_fgetxattr (filler->fd, k, (char *)array, + v->len); + } + + op_errno = errno; + if ((size == -1) && (op_errno != ENODATA) && + (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr (marker_xattrs, + k)) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s while doing " + "xattrop: Key:%s (%s)", + filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fgetxattr failed on fd=%d while doing " + "xattrop: Key:%s (%s)", + filler->fd, + k, strerror (op_errno)); + } + + op_ret = -1; + goto unlock; + } + + switch (optype) { + + case GF_XATTROP_ADD_ARRAY: + __add_array ((int32_t *) array, (int32_t *) v->data, + v->len / 4); + break; + + case 
GF_XATTROP_ADD_ARRAY64: + __add_long_array ((int64_t *) array, (int64_t *) v->data, + v->len / 8); + break; + + case GF_XATTROP_OR_ARRAY: + __or_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_AND_ARRAY: + __and_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattrop type (%d) on %s. Please send " + "a bug report to gluster-devel@nongnu.org", + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } + + if (filler->real_path) { + size = sys_lsetxattr (filler->real_path, k, array, + v->len, 0); + } else { + size = sys_fsetxattr (filler->fd, k, (char *)array, + v->len, 0); + } + } +unlock: + UNLOCK (&inode->lock); + + if (op_ret == -1) + goto out; + + op_errno = errno; + if (size == -1) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "setxattr failed on %s while doing xattrop: " + "key=%s (%s)", filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr failed on fd=%d while doing xattrop: " + "key=%s (%s)", filler->fd, + k, strerror (op_errno)); + + op_ret = -1; + goto out; + } else { + size = dict_set_bin (d, k, array, v->len); + + if (size != 0) { + if (filler->real_path) + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", filler->real_path, + k, strerror (-size)); + else + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (fd=%d): " + "key=%s (%s)", filler->fd, + k, strerror (-size)); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + array = NULL; + } + + array = NULL; + +out: + return op_ret; +} + /** * xattrop - xattr operations - for internal use by GlusterFS * @optype: ADD_ARRAY: @@ -3137,32 +3808,24 @@ int do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) { - char *real_path = NULL; - char *array = NULL; - int size = 0; - int count = 
0; - - int op_ret = 0; - int op_errno = 0; - - int ret = 0; - int _fd = -1; - struct posix_fd *pfd = NULL; - - char * path = NULL; - inode_t * inode = NULL; + int op_ret = 0; + int op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (xattr, out); VALIDATE_OR_GOTO (this, out); if (fd) { - ret = posix_fd_ctx_get (fd, this, &pfd); - if (ret < 0) { + op_ret = posix_fd_ctx_get (fd, this, &pfd); + if (op_ret < 0) { gf_log (this->name, GF_LOG_WARNING, "failed to get pfd from fd=%p", fd); - op_ret = -1; op_errno = EBADFD; goto out; } @@ -3173,152 +3836,21 @@ do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, MAKE_INODE_HANDLE (real_path, this, loc, NULL); if (real_path) { - path = gf_strdup (real_path); inode = loc->inode; } else if (fd) { inode = fd->inode; } - int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v, - void *tmp) - { - - count = v->len; - array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); - - LOCK (&inode->lock); - { - if (loc) { - size = sys_lgetxattr (real_path, k, - (char *)array, v->len); - } else { - size = sys_fgetxattr (_fd, k, (char *)array, - v->len); - } - - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else if (op_errno != ENOENT || - !posix_special_xattr (marker_xattrs, - k)) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: Key:%s (%s)", path, - k, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while doing " - "xattrop: Key:%s (%s)", _fd, - k, strerror (op_errno)); - } - - op_ret = -1; - goto unlock; - } - - switch (optype) { - - case GF_XATTROP_ADD_ARRAY: 
- __add_array ((int32_t *) array, (int32_t *) v->data, - v->len / 4); - break; - - case GF_XATTROP_ADD_ARRAY64: - __add_long_array ((int64_t *) array, (int64_t *) v->data, - v->len / 8); - break; - - case GF_XATTROP_OR_ARRAY: - __or_array ((int32_t *) array, - (int32_t *) v->data, - v->len / 4); - break; - - case GF_XATTROP_AND_ARRAY: - __and_array ((int32_t *) array, - (int32_t *) v->data, - v->len / 4); - break; - - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. Please send " - "a bug report to gluster-devel@nongnu.org", - optype, path); - op_ret = -1; - op_errno = EINVAL; - goto unlock; - } - - if (loc) { - size = sys_lsetxattr (real_path, k, array, - v->len, 0); - } else { - size = sys_fsetxattr (_fd, k, (char *)array, - v->len, 0); - } - } - unlock: - UNLOCK (&inode->lock); - - if (op_ret == -1) - goto out; - - op_errno = errno; - if (size == -1) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", path, - k, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing xattrop: " - "key=%s (%s)", _fd, - k, strerror (op_errno)); - - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, k, array, v->len); - - if (size != 0) { - if (loc) - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", path, - k, strerror (-size)); - else - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - k, strerror (-size)); - - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } + filler.this = this; + filler.fd = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; - array = NULL; - - out: - return op_ret; - } - op_ret = dict_foreach (xattr, _handle_every_keyvalue_pair, NULL); + op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair, + &filler); out: - GF_FREE (array); - - GF_FREE (path); 
STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL); return 0; @@ -3723,6 +4255,8 @@ posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dic struct iatt stbuf = {0, }; uuid_t gfid; + if (list_empty(&entries->list)) + return 0; itable = fd->inode->table; @@ -3811,8 +4345,23 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, */ ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); - count = posix_fill_readdir (fd, dir, off, size, &entries, this, - skip_dirs); + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. + + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. 
+ */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -4015,6 +4564,27 @@ posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) return ret; } + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + + int reconfigure (xlator_t *this, dict_t *options) { @@ -4022,6 +4592,7 @@ reconfigure (xlator_t *this, dict_t *options) struct posix_private *priv = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str = NULL; priv = this->private; @@ -4029,6 +4600,18 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); posix_set_owner (this, uid, gid); + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, options, bool, out); @@ -4037,6 +4620,20 @@ reconfigure (xlator_t *this, dict_t *options) else posix_aio_off (this); + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); + + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd 
uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); + ret = 0; out: return ret; @@ -4066,6 +4663,7 @@ init (xlator_t *this) char *guuid = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -4218,7 +4816,7 @@ init (xlator_t *this) } } - size = sys_lgetxattr (dir_data->data, "system.posix_acl_access", + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, NULL, 0); if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, @@ -4398,11 +4996,48 @@ init (xlator_t *this) } } + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + 
batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4468,6 +5103,9 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { @@ -4504,12 +5142,51 @@ struct volume_options options[] = { { .key = {"brick-uid"}, .type = GF_OPTION_TYPE_INT, - .description = "Support for setting uid of brick's root" + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting uid of brick's owner" }, { .key = {"brick-gid"}, .type = GF_OPTION_TYPE_INT, - .description = "Support for setting gid of brick's root" + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting gid of brick's owner" }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." 
/*
 * Size of one vector in bytes (64KB).  The replacement list is
 * parenthesised so the macro expands correctly inside larger expressions
 * such as 'len / VECTOR_SIZE' or 'len % VECTOR_SIZE'.
 */
#define VECTOR_SIZE (64 * 1024)
/* NOTE(review): presumably the maximum number of vectors - confirm against
 * the users of this macro. */
#define MAX_NO_VECT 1024
posix_private *)this->private)->base_path_length) @@ -151,7 +190,7 @@ int posix_get_file_contents (xlator_t *this, uuid_t pargfid, int posix_set_file_contents (xlator_t *this, const char *path, char *key, data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); @@ -163,4 +202,7 @@ gf_boolean_t posix_special_xattr (char **pattern, char *key); void __posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); #endif /* _POSIX_H */ |
