summaryrefslogtreecommitdiffstats
path: root/xlators/storage
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage')
-rw-r--r--xlators/storage/Makefile.am3
-rw-r--r--xlators/storage/bd/Makefile.am (renamed from xlators/storage/bd_map/Makefile.am)0
-rw-r--r--xlators/storage/bd/src/Makefile.am (renamed from xlators/storage/bd_map/src/Makefile.am)11
-rw-r--r--xlators/storage/bd/src/bd-aio.c527
-rw-r--r--xlators/storage/bd/src/bd-aio.h41
-rw-r--r--xlators/storage/bd/src/bd-helper.c783
-rw-r--r--xlators/storage/bd/src/bd.c2404
-rw-r--r--xlators/storage/bd/src/bd.h178
-rw-r--r--xlators/storage/bd_map/src/bd_map.c2580
-rw-r--r--xlators/storage/bd_map/src/bd_map.h76
-rw-r--r--xlators/storage/bd_map/src/bd_map_help.c501
-rw-r--r--xlators/storage/bd_map/src/bd_map_help.h69
-rw-r--r--xlators/storage/posix/src/Makefile.am2
-rw-r--r--xlators/storage/posix/src/posix-aio.c12
-rw-r--r--xlators/storage/posix/src/posix-handle.c7
-rw-r--r--xlators/storage/posix/src/posix-helpers.c407
-rw-r--r--xlators/storage/posix/src/posix.c1103
-rw-r--r--xlators/storage/posix/src/posix.h46
18 files changed, 5245 insertions, 3505 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am
index e1316a127..c08e8e41b 100644
--- a/xlators/storage/Makefile.am
+++ b/xlators/storage/Makefile.am
@@ -1,6 +1,7 @@
SUBDIRS = posix
if ENABLE_BD_XLATOR
-SUBDIRS += bd_map
+SUBDIRS += bd
endif
+
CLEANFILES =
diff --git a/xlators/storage/bd_map/Makefile.am b/xlators/storage/bd/Makefile.am
index a985f42a8..a985f42a8 100644
--- a/xlators/storage/bd_map/Makefile.am
+++ b/xlators/storage/bd/Makefile.am
diff --git a/xlators/storage/bd_map/src/Makefile.am b/xlators/storage/bd/src/Makefile.am
index be43d2abb..3d93f7442 100644
--- a/xlators/storage/bd_map/src/Makefile.am
+++ b/xlators/storage/bd/src/Makefile.am
@@ -1,14 +1,13 @@
-
if ENABLE_BD_XLATOR
-xlator_LTLIBRARIES = bd_map.la
+xlator_LTLIBRARIES = bd.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
-bd_map_la_LDFLAGS = -module -avoidversion
+bd_la_LDFLAGS = -module -avoid-version
LIBBD = -llvm2app -lrt
-bd_map_la_SOURCES = bd_map.c bd_map_help.c
-bd_map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD)
+bd_la_SOURCES = bd.c bd-helper.c bd-aio.c
+bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO)
-noinst_HEADERS = bd_map.h bd_map_help.h
+noinst_HEADERS = bd.h bd-aio.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/rpc/xdr/src \
diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c
new file mode 100644
index 000000000..62d4590f7
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.c
@@ -0,0 +1,527 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author: M. Mohan Kumar <mohan@in.ibm.com>
+
+ Based on posix-aio.c
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <lvm2app.h>
+#include <sys/uio.h>
+
+#include "xlator.h"
+#include "glusterfs.h"
+#include "defaults.h"
+#include "bd.h"
+#include "bd-aio.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+
+/*
+ * Per-request state for one asynchronous read or write. The structure is
+ * handed to libaio through iocb.data and comes back to the reaper thread
+ * in io_event.data.
+ */
+struct bd_aio_cb {
+ /* libaio control block submitted with io_submit() */
+ struct iocb iocb;
+ /* frame that is unwound when the request completes */
+ call_frame_t *frame;
+ /* destination buffer of a read request (unused for writes) */
+ struct iobuf *iobuf;
+ /* reference held on the caller's iobref for a write request */
+ struct iobref *iobref;
+ /* copy of the cached iatt taken before a write is submitted */
+ struct iatt prebuf;
+ /* GF_FOP_READ or GF_FOP_WRITE; selects the completion handler */
+ int op;
+ /* file offset of the request */
+ off_t offset;
+ /* fd the request was issued on */
+ fd_t *fd;
+};
+
+/*
+ * Enable or disable O_DIRECT on the descriptor backing @bd_fd so that it
+ * suits the request about to be issued.
+ *
+ * O_DIRECT is used whenever the fd flags or @opflags ask for it. Otherwise
+ * it is used only when neither @offset nor @size has any of the low 12 bits
+ * set, i.e. both are multiples of 4096.
+ *
+ * bd_fd->odirect caches the current mode. It is updated only after fcntl()
+ * succeeded, so a failed switch is attempted again on the next request
+ * instead of leaving the cache out of sync with the descriptor.
+ *
+ * The callers in this file invoke this function with fd->lock held.
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags,
+                     off_t offset, size_t size)
+{
+        int odirect = 0;
+        int flags   = 0;
+        int ret     = 0;
+
+        if ((fd->flags|opflags) & O_DIRECT) {
+                /* if instructed, use O_DIRECT always */
+                odirect = 1;
+        } else {
+                /* else use O_DIRECT when feasible */
+                odirect = ((offset|size) & 0xfff) ? 0 : 1;
+        }
+
+        /* descriptor is already in the wanted mode */
+        if (!odirect == !bd_fd->odirect)
+                return;
+
+        flags = fcntl (bd_fd->fd, F_GETFL);
+        if (flags == -1) {
+                ret = -1;
+        } else {
+                if (odirect)
+                        flags |= O_DIRECT;
+                else
+                        flags &= ~O_DIRECT;
+                ret = fcntl (bd_fd->fd, F_SETFL, flags);
+        }
+
+        if (ret) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d",
+                        strerror (errno), bd_fd->fd, flags, bd_fd->odirect);
+                return;
+        }
+
+        bd_fd->odirect = odirect;
+}
+
+/*
+ * Completion handler for an asynchronous read, called from the reaper
+ * thread. Unwinds the readv frame with the data (or the error) and releases
+ * everything owned by @paiocb.
+ *
+ * @res is the io_event result: the number of bytes read, or a negated errno.
+ * @res2 is not used.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iobuf   *iobuf    = NULL;
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        /* zeroed so the error path never hands out indeterminate bytes */
+        struct iovec    iov      = {0,};
+        struct iobref  *iobref   = NULL;
+        off_t           offset   = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame = paiocb->frame;
+        this = frame->this;
+        iobuf = paiocb->iobuf;
+        offset = paiocb->offset;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)",
+                        paiocb->fd, paiocb->iocb.u.c.nbytes,
+                        (unsigned long long) paiocb->offset,
+                        res, strerror (op_errno));
+                goto out;
+        }
+
+        /* the cached attributes are needed for the post-op iatt */
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async) inode context missing fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref_add (iobref, iobuf);
+
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len = op_ret;
+
+        /* Hack to notify higher layers of EOF. */
+        if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+                op_errno = ENOENT;
+
+out:
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
+                             &postbuf, iobref, NULL);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        if (iobref)
+                iobref_unref (iobref);
+
+        /* paiocb comes from CALLOC() in bd_aio_readv, so pair it with FREE() */
+        FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * readv fop used while AIO is enabled. Submits an asynchronous pread on the
+ * block device backing @fd; the reply is sent from bd_aio_readv_complete().
+ *
+ * When no bd_fd_t can be obtained for @fd the call is wound to the first
+ * child instead. On any failure before a successful io_submit() the frame
+ * is unwound here with -1 and an errno.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        struct iobuf      *iobuf    = NULL;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv, fd, size, offset,
+                            flags, xdata);
+                return 0;
+        }
+
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto err;
+        }
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb = CALLOC (1, sizeof (*paiocb));
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame = frame;
+        paiocb->iobuf = iobuf;
+        paiocb->offset = offset;
+        paiocb->op = GF_FOP_READ;
+        paiocb->fd = fd;
+
+        paiocb->iocb.data = paiocb;
+        paiocb->iocb.aio_fildes = bd_fd->fd;
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD;
+        paiocb->iocb.aio_reqprio = 0;
+        paiocb->iocb.u.c.buf = iobuf_ptr (iobuf);
+        paiocb->iocb.u.c.nbytes = size;
+        paiocb->iocb.u.c.offset = offset;
+
+        iocb = &paiocb->iocb;
+
+        /* mode switch and submission happen under the same lock */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset, size);
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                /* never report a failure with errno 0 */
+                op_errno = (ret < 0) ? -ret : EIO;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        /* allocated with CALLOC() above, so release with FREE() */
+        if (paiocb) {
+                FREE (paiocb);
+        }
+
+        return 0;
+}
+
+/*
+ * Completion handler for an asynchronous write, called from the reaper
+ * thread. Updates the cached mtime, unwinds the writev frame and releases
+ * everything owned by @paiocb.
+ *
+ * @res is the io_event result: the number of bytes written, or a negated
+ * errno. @res2 is not used.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iatt     prebuf   = {0,};
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame = paiocb->frame;
+        prebuf = paiocb->prebuf;
+        this = frame->this;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async) failed fd=%p,offset=%llu (%d/%s)",
+                        paiocb->fd, (unsigned long long) paiocb->offset, res,
+                        strerror (op_errno));
+
+                goto out;
+        }
+
+        /* the cached attributes are needed for the post-op iatt */
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async) inode context missing fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+out:
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+                             NULL);
+
+        if (paiocb->iobref)
+                iobref_unref (paiocb->iobref);
+
+        /* paiocb comes from CALLOC() in bd_aio_writev, so pair it with FREE() */
+        FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * writev fop used while AIO is enabled. Submits an asynchronous pwritev on
+ * the block device backing @fd; the reply is sent from
+ * bd_aio_writev_complete().
+ *
+ * When no bd_fd_t can be obtained for @fd the call is wound to the first
+ * child instead. On any failure before a successful io_submit() the frame
+ * is unwound here with -1 and an errno.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *iov, int count, off_t offset, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+        bd_attr_t         *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_writev_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                            fd, iov, count, offset, flags, iobref, xdata);
+                return 0;
+        }
+
+        /* the cached iatt is copied into prebuf below; it must exist */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) || !bdatt) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        paiocb = CALLOC (1, sizeof (*paiocb));
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame = frame;
+        paiocb->offset = offset;
+        paiocb->op = GF_FOP_WRITE;
+        paiocb->fd = fd;
+
+        paiocb->iocb.data = paiocb;
+        paiocb->iocb.aio_fildes = bd_fd->fd;
+        paiocb->iobref = iobref_ref (iobref);
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV;
+        paiocb->iocb.aio_reqprio = 0;
+        paiocb->iocb.u.v.vec = iov;
+        paiocb->iocb.u.v.nr = count;
+        paiocb->iocb.u.v.offset = offset;
+
+        iocb = &paiocb->iocb;
+
+        memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+        /* mode switch and submission happen under the same lock */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset,
+                                     iov_length (iov, count));
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                /* never report a failure with errno 0 */
+                op_errno = (ret < 0) ? -ret : EIO;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);
+
+        if (paiocb) {
+                if (paiocb->iobref)
+                        iobref_unref (paiocb->iobref);
+                /* allocated with CALLOC() above, so release with FREE() */
+                FREE (paiocb);
+        }
+
+        return 0;
+}
+
+/*
+ * Body of the AIO reaper thread. @data is the xlator. Waits (5 second
+ * timeout per call) for completed requests on priv->ctxp and hands each one
+ * to the read or write completion handler according to its recorded op.
+ * The loop ends only when io_getevents() fails with something other than
+ * -EINTR. Returns NULL.
+ */
+void *
+bd_aio_thread (void *data)
+{
+        xlator_t          *this    = data;
+        bd_priv_t         *priv    = NULL;
+        struct bd_aio_cb  *req     = NULL;
+        struct io_event   *ev      = NULL;
+        struct io_event    events[BD_AIO_MAX_NR_GETEVENTS];
+        struct timespec    timeout = {0, };
+        int                nr      = 0;
+        int                idx     = 0;
+
+        THIS = this;
+        priv = this->private;
+
+        timeout.tv_sec = 5;
+
+        while (1) {
+                memset (events, 0, sizeof (events));
+                nr = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS,
+                                   events, &timeout);
+
+                /* interrupted: simply wait again */
+                if (nr == -EINTR)
+                        continue;
+
+                if (nr < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "io_getevents() returned %d, exiting", nr);
+                        break;
+                }
+
+                for (idx = 0; idx < nr; idx++) {
+                        ev  = &events[idx];
+                        req = ev->data;
+
+                        if (req->op == GF_FOP_READ) {
+                                bd_aio_readv_complete (req, ev->res,
+                                                       ev->res2);
+                        } else if (req->op == GF_FOP_WRITE) {
+                                bd_aio_writev_complete (req, ev->res,
+                                                        ev->res2);
+                        } else {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "unknown op %d found in piocb",
+                                        req->op);
+                        }
+                }
+        }
+
+        return NULL;
+}
+
+/*
+ * Create the kernel AIO context and start the reaper thread.
+ *
+ * priv->aio_capable is set to true, and the AIO read/write fops are
+ * installed, only when both steps succeed. When the kernel reports ENOSYS
+ * the function still returns 0 (synchronous IO keeps working) but leaves
+ * aio_capable untouched, so callers do not switch to AIO fops that have no
+ * context to submit to.
+ *
+ * Returns 0 on success or when AIO is unavailable at run-time, a negative
+ * value when io_setup() fails otherwise, and the pthread_create() error
+ * code when the thread cannot be started.
+ */
+int
+bd_aio_init (xlator_t *this)
+{
+        bd_priv_t *priv = NULL;
+        int        ret  = 0;
+
+        priv = this->private;
+
+        ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp);
+        if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Linux AIO not available at run-time."
+                        " Continuing with synchronous IO");
+                ret = 0;
+                goto out;
+        }
+
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "io_setup() failed. ret=%d, errno=%d",
+                        ret, errno);
+                goto out;
+        }
+
+        ret = pthread_create (&priv->aiothread, NULL,
+                              bd_aio_thread, this);
+        if (ret != 0) {
+                io_destroy (priv->ctxp);
+                goto out;
+        }
+
+        priv->aio_capable = _gf_true;
+        this->fops->readv = bd_aio_readv;
+        this->fops->writev = bd_aio_writev;
+out:
+        return ret;
+}
+
+
+/*
+ * Switch the xlator to asynchronous read/write. The AIO machinery is set up
+ * on the first call only; later calls just reinstall the AIO fops if that
+ * setup left the xlator AIO capable.
+ *
+ * Returns the result of bd_aio_init() on the first call, 0 afterwards.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+        bd_priv_t *priv = NULL;
+        int        ret  = 0;
+
+        priv = this->private;
+
+        if (!priv->aio_init_done) {
+                /* bd_aio_init() raises this only after a complete setup */
+                priv->aio_capable = _gf_false;
+                ret = bd_aio_init (this);
+                priv->aio_init_done = _gf_true;
+        }
+
+        if (priv->aio_capable) {
+                this->fops->readv = bd_aio_readv;
+                this->fops->writev = bd_aio_writev;
+        }
+
+        return ret;
+}
+
+/*
+ * Switch the xlator back to the synchronous handlers by pointing the
+ * readv/writev fops at bd_readv/bd_writev. Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+        this->fops->writev = bd_writev;
+        this->fops->readv  = bd_readv;
+
+        return 0;
+}
+
+#else
+
+/*
+ * Build without libaio: nothing can be enabled, so only log the fact.
+ * Always returns 0.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+        const char *name = this->name;
+
+        gf_log (name, GF_LOG_INFO,
+                "Linux AIO not available at build-time."
+                " Continuing with synchronous IO");
+
+        return 0;
+}
+
+/*
+ * Build without libaio: nothing to disable, so only log the fact.
+ * Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+        const char *name = this->name;
+
+        gf_log (name, GF_LOG_INFO,
+                "Linux AIO not available at build-time."
+                " Continuing with synchronous IO");
+
+        return 0;
+}
+
+/*
+ * Build without libaio: the O_DIRECT mode is never changed; the call only
+ * logs that AIO support was not compiled in. All arguments are ignored.
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags,
+                     off_t offset, size_t size)
+{
+        gf_log (THIS->name, GF_LOG_INFO,
+                "Linux AIO not available at build-time."
+                " Continuing with synchronous IO");
+}
+#endif
diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h
new file mode 100644
index 000000000..16f686a4c
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.h
@@ -0,0 +1,41 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _BD_AIO_H
+#define _BD_AIO_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+/*
+ * Maximum number of concurrently submitted IO events. The heaviest load
+ * GlusterFS has been able to handle had 60-80 concurrent calls
+ */
+#define BD_AIO_MAX_NR_EVENTS 256
+
+/* Maximum number of completed IO operations to reap per getevents syscall */
+#define BD_AIO_MAX_NR_GETEVENTS 16
+
+/*
+ * Enable / disable asynchronous IO for the xlator. With libaio available,
+ * bd_aio_on() installs the AIO readv/writev fops when the xlator is AIO
+ * capable and bd_aio_off() restores bd_readv/bd_writev; without libaio both
+ * only log a message.
+ */
+int bd_aio_on (xlator_t *this);
+int bd_aio_off (xlator_t *this);
+
+/*
+ * Read handler that bd_aio_off() installs as the readv fop; it is defined
+ * outside this header.
+ */
+int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+/*
+ * Write handler that bd_aio_off() installs as the writev fop; it is defined
+ * outside this header.
+ */
+int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
+
+#endif /* !_BD_AIO_H */
diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c
new file mode 100644
index 000000000..5525e346b
--- /dev/null
+++ b/xlators/storage/bd/src/bd-helper.c
@@ -0,0 +1,783 @@
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "run.h"
+
+/*
+ * Store @ctx as this xlator's context on @inode.
+ * Returns the inode_ctx_set() result, or -1 when @inode or @ctx is NULL.
+ */
+int
+bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx)
+{
+        uint64_t value = 0;
+        int      rc    = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, done);
+        GF_VALIDATE_OR_GOTO (this->name, ctx, done);
+
+        /* the context slot is a 64-bit integer; keep the pointer in it */
+        value = (long)ctx;
+        rc = inode_ctx_set (inode, this, &value);
+done:
+        return rc;
+}
+
+/*
+ * Fetch this xlator's context from @inode.
+ * On success returns 0 and, when @ctx is not NULL, stores the context
+ * pointer in *@ctx. Returns -1 when @inode is NULL, otherwise the
+ * inode_ctx_get() result; *@ctx is left untouched on failure.
+ */
+int
+bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx)
+{
+        uint64_t value = 0;
+        int      rc    = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, done);
+
+        rc = inode_ctx_get (inode, this, &value);
+        if (rc == 0 && ctx != NULL)
+                *ctx = (bd_attr_t *) value;
+done:
+        return rc;
+}
+
+/*
+ * Release everything referenced by @local and return it to its memory
+ * pool. A NULL @local is ignored. The loc is wiped only when no fd is held
+ * and the loc has a path.
+ */
+void
+bd_local_free (xlator_t *this, bd_local_t *local)
+{
+        if (local == NULL)
+                return;
+
+        if (local->fd != NULL) {
+                fd_unref (local->fd);
+        } else if (local->loc.path != NULL) {
+                loc_wipe (&local->loc);
+        }
+
+        if (local->dict != NULL)
+                dict_unref (local->dict);
+
+        if (local->inode != NULL)
+                inode_unref (local->inode);
+
+        if (local->bdatt != NULL) {
+                GF_FREE (local->bdatt->type);
+                GF_FREE (local->bdatt);
+        }
+
+        mem_put (local);
+}
+
+/*
+ * Allocate a zeroed bd_local_t from the xlator's local pool and attach it
+ * to @frame. Returns the new local, or NULL when the pool is exhausted
+ * (frame->local is NULL in that case too).
+ */
+bd_local_t *
+bd_local_init (call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local = mem_get0 (this->local_pool);
+
+        frame->local = local;
+
+        return local;
+}
+
+/*
+ * VGs are tagged in the GF_XATTR_VOL_ID_KEY:<uuid> format.
+ * This function validates that tag against the volume-id option of the
+ * volume file. It also walks the LV list to find out whether a thin pool
+ * is configured; the first thin pool found is recorded in priv->pool and
+ * BD_CAPS_THIN is added to priv->caps.
+ *
+ * Returns 0 when the tag matches or when no volume-id option is set (in
+ * which case nothing can be validated), -1 when validation fails and
+ * ENOENT when the VG cannot be opened.
+ */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv)
+{
+        vg_t                  brick       = NULL;
+        data_t               *tmp_data    = NULL;
+        struct dm_list       *tags        = NULL;
+        int                   op_ret      = -1;
+        uuid_t                dict_uuid   = {0, };
+        uuid_t                vg_uuid     = {0, };
+        bd_gfid_t             vg_uuid_str = {0, };
+        gf_boolean_t          uuid        = _gf_false;
+        lvm_str_list_t       *strl        = NULL;
+        struct dm_list       *lv_dm_list  = NULL;
+        lv_list_t            *lv_list     = NULL;
+        struct dm_list       *dm_seglist  = NULL;
+        lvseg_list_t         *seglist     = NULL;
+        lvm_property_value_t  prop        = {0, };
+        gf_boolean_t          thin        = _gf_false;
+        const char           *lv_name     = NULL;
+        size_t                key_len     = 0;
+
+        brick = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!brick) {
+                gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        lv_dm_list = lvm_vg_list_lvs (brick);
+        if (!lv_dm_list)
+                goto check;
+
+        dm_list_iterate_items (lv_list, lv_dm_list) {
+                dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+                if (!dm_seglist)
+                        continue;
+                dm_list_iterate_items (seglist, dm_seglist) {
+                        prop = lvm_lvseg_get_property (seglist->lvseg,
+                                                       "segtype");
+                        if (!prop.is_valid || !prop.value.string)
+                                continue;
+                        if (!strcmp (prop.value.string, "thin-pool")) {
+                                thin = _gf_true;
+                                lv_name = lvm_lv_get_name (lv_list->lv);
+                                priv->pool = gf_strdup (lv_name);
+                                gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+                                        "\"%s\" will be used for thin LVs",
+                                        lv_name);
+                                break;
+                        }
+                }
+                /*
+                 * Stop at the first pool: scanning on would overwrite
+                 * priv->pool and leak the name duplicated above.
+                 */
+                if (thin)
+                        break;
+        }
+
+check:
+        /* If there is no volume-id set in dict, we cant validate */
+        tmp_data = dict_get (this->options, "volume-id");
+        if (!tmp_data) {
+                op_ret = 0;
+                goto out;
+        }
+
+        op_ret = uuid_parse (tmp_data->data, dict_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in volume file",
+                        tmp_data->data);
+                op_ret = -1;
+                goto out;
+        }
+
+        tags = lvm_vg_get_tags (brick);
+        if (!tags) { /* no tags in the VG */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+
+        key_len = strlen (GF_XATTR_VOL_ID_KEY);
+        dm_list_iterate_items (strl, tags) {
+                if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, key_len)) {
+                        uuid = _gf_true;
+                        break;
+                }
+        }
+        /* UUID tag is not set in VG */
+        if (!uuid) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+
+        /*
+         * The uuid starts one character (the separator) after the key.
+         * A tag that ends right after the key has nothing to parse, and
+         * skipping the separator would read past its terminator.
+         */
+        if (strl->str[key_len] == '\0') {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in VG", strl->str);
+                op_ret = -1;
+                goto out;
+        }
+
+        op_ret = uuid_parse (strl->str + key_len + 1, vg_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in VG", strl->str);
+                op_ret = -1;
+                goto out;
+        }
+        if (uuid_compare (dict_uuid, vg_uuid)) {
+                /* uuid_t is binary; convert it before printing with %s */
+                uuid_utoa_r (vg_uuid, vg_uuid_str);
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mismatching volume-id (%s) received. "
+                        "already is a part of volume %s ",
+                        tmp_data->data, vg_uuid_str);
+                op_ret = -1;
+                goto out;
+        }
+
+        op_ret = 0;
+
+out:
+        lvm_vg_close (brick);
+
+        if (!thin)
+                gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in "
+                        "VG %s\n", priv->vg);
+        else
+                priv->caps |= BD_CAPS_THIN;
+
+        return op_ret;
+}
+
+/* FIXME: Move this code to common place, so posix and bd xlator can use */
+/*
+ * Allocate @size bytes plus ALIGN_SIZE of slack and compute an
+ * ALIGN_SIZE-aligned pointer inside that allocation.
+ *
+ * Returns the raw allocation (the pointer that must be passed to GF_FREE)
+ * and stores the aligned pointer in *@aligned_buf. On allocation failure
+ * returns NULL and sets *@aligned_buf to NULL, so a caller checking either
+ * pointer sees the failure.
+ */
+char *
+page_aligned_alloc (size_t size, char **aligned_buf)
+{
+        char *alloc_buf = NULL;
+
+        alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char);
+        if (!alloc_buf) {
+                *aligned_buf = NULL;
+                return NULL;
+        }
+
+        /* page aligned buffer */
+        *aligned_buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+
+        return alloc_buf;
+}
+
+/*
+ * Look up, or create on first use, the bd_fd_t attached to @fd.
+ *
+ * Returns 0 without touching *@bdfd_p when @fd is not a regular file or its
+ * inode has no BD context. Returns 0 with *@bdfd_p set when a context
+ * exists or was created by opening /dev/<vg>/<gfid>. Any other return value
+ * is a failure and *@bdfd_p is left untouched.
+ *
+ * Uses the unlocked __fd_ctx_get/__fd_ctx_set variants; bd_fd_ctx_get()
+ * calls this with fd->lock held.
+ */
+static int
+__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p)
+{
+        int         ret      = -1;
+        int         _fd      = -1;
+        char       *devpath  = NULL;
+        bd_fd_t    *bdfd     = NULL;
+        uint64_t    tmp_bdfd = 0;
+        bd_priv_t  *priv     = this->private;
+        bd_gfid_t   gfid     = {0, };
+        bd_attr_t  *bdatt    = NULL;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                return 0;
+
+        ret = __fd_ctx_get (fd, this, &tmp_bdfd);
+        if (ret == 0) {
+                bdfd = (void *)(long) tmp_bdfd;
+                *bdfd_p = bdfd;
+                return 0;
+        }
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        /*
+         * asprintf() leaves the pointer undefined on failure, so test its
+         * return value and reset the pointer before it reaches FREE().
+         * ret is still the non-zero __fd_ctx_get() result here.
+         */
+        if (asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid) < 0) {
+                devpath = NULL;
+                goto out;
+        }
+
+        _fd = open (devpath, O_RDWR | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bdfd, ret, out);
+
+        bdfd->fd = _fd;
+        bdfd->flag = O_RDWR | O_LARGEFILE;
+        if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                goto out;
+        }
+
+        *bdfd_p = bdfd;
+
+        ret = 0;
+out:
+        FREE (devpath);
+        if (ret) {
+                /* only close a descriptor that was actually opened */
+                if (_fd >= 0)
+                        close (_fd);
+                GF_FREE (bdfd);
+        }
+        return ret;
+}
+
+/*
+ * Locked wrapper around __bd_fd_ctx_get(): takes fd->lock for the duration
+ * of the lookup/creation and returns that function's result.
+ */
+int
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd)
+{
+        int rc;
+
+        /* FIXME: Is it ok to fd->lock here ? */
+        LOCK (&fd->lock);
+        rc = __bd_fd_ctx_get (this, fd, bdfd);
+        UNLOCK (&fd->lock);
+
+        return rc;
+}
+
+/*
+ * Validates whether an LV exists for the given gfid.
+ *
+ * @bd is the xattr value in "<type>[:<size>]" form; it is modified in place
+ * (the ':' is replaced by a terminator). On a recognised type, *@type
+ * receives a gf_strdup()ed copy of the type string.
+ *
+ * Returns 0 if the LV exists and its size matches the size in @bd.
+ * Returns -1 if the type is invalid or the LV (or VG) does not exist.
+ * Returns 1 if the LV size mismatches; *@lv_size is then the actual size.
+ * Also returns 0 when the device path cannot be allocated.
+ */
+int
+bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                      uint64_t *lv_size, uuid_t uuid)
+{
+        char         *path   = NULL;
+        int           ret    = -1;
+        bd_gfid_t     gfid   = {0, };
+        bd_priv_t    *priv   = this->private;
+        struct stat   stbuf  = {0, };
+        uint64_t      size   = 0;
+        vg_t          vg     = NULL;
+        lv_t          lv     = NULL;
+        char         *bytes  = NULL;
+
+        bytes = strrchr (bd, ':');
+        if (bytes) {
+                *bytes = '\0';
+                bytes++;
+                gf_string2bytesize (bytes, &size);
+        }
+
+        if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid xattr %s", bd);
+                return -1;
+        }
+        *type = gf_strdup (bd);
+
+        /*
+         * Check if LV really exist, there could be a failure
+         * after setxattr and successful LV creation
+         */
+        uuid_utoa_r (uuid, gfid);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid);
+        if (!path) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "insufficient memory");
+                return 0;
+        }
+
+        /* Destination file does not exist */
+        if (stat (path, &stbuf)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lstat failed for path %s", path);
+                /* leave through out so that path is released */
+                ret = -1;
+                goto out;
+        }
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (!vg) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "VG %s does not exist?", priv->vg);
+                ret = -1;
+                goto out;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "LV %s does not exist", gfid);
+                ret = -1;
+                goto out;
+        }
+
+        *lv_size = lvm_lv_get_size (lv);
+        if (size == *lv_size) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = 1;
+
+out:
+        if (vg)
+                lvm_vg_close (vg);
+
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Create thin LV @lv of virtual size @extent bytes in pool @vg/@pool by
+ * running the LVM_CREATE command, then check that /dev/<vg>/<lv> appeared.
+ *
+ * Returns 0 when the device node exists afterwards, EAGAIN when it does
+ * not and ENOMEM when the path cannot be allocated.
+ */
+static int
+create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent)
+{
+        int          ret    = -1;
+        runner_t     runner = {0, };
+        char        *path   = NULL;
+        struct stat  stbuf  = {0, };
+
+        runinit (&runner);
+        runner_add_args (&runner, LVM_CREATE, NULL);
+        runner_add_args (&runner, "--thin", NULL);
+        runner_argprintf (&runner, "%s/%s", vg, pool);
+        runner_add_args (&runner, "--name", NULL);
+        runner_argprintf (&runner, "%s", lv);
+        runner_add_args (&runner, "--virtualsize", NULL);
+        /* uint64_t is not 'long' everywhere; print it through a cast */
+        runner_argprintf (&runner, "%lluB", (unsigned long long) extent);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        gf_asprintf (&path, "/dev/%s/%s", vg, lv);
+        if (!path) {
+                ret = ENOMEM;
+                goto out;
+        }
+        if (lstat (path, &stbuf) < 0)
+                ret = EAGAIN;
+        else
+                ret = 0;
+out:
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Create an LV named after @uuid with @size bytes. @type selects a thin LV
+ * (BD_THIN, created through create_thin_lv()) or a linear LV.
+ *
+ * Returns 0 on success, ENOENT when the VG cannot be opened, otherwise the
+ * errno of the failed creation (EIO when errno was not set).
+ */
+int
+bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv)
+{
+        int        ret  = 0;
+        vg_t       vg   = NULL;
+        bd_gfid_t  gfid = {0, };
+
+        uuid_utoa_r (uuid, gfid);
+
+        if (!strcmp (type, BD_THIN))
+                return create_thin_lv (priv->vg, priv->pool, gfid,
+                                       size);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        if (!lvm_vg_create_lv_linear (vg, gfid, size)) {
+                /* capture errno before logging can overwrite it */
+                ret = errno;
+                if (!ret)
+                        ret = EIO; /* a failure must not look like success */
+                gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear "
+                        "failed");
+        }
+
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Resize the LV named after @uuid to @size bytes by running LVM_RESIZE,
+ * then verify the result through lvm2app.
+ *
+ * Returns 0 when the LV has the requested size afterwards, EAGAIN when the
+ * VG cannot be opened and EIO when the LV is missing or its size differs.
+ */
+int32_t
+bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size)
+{
+        uint64_t   new_size = 0;
+        runner_t   runner   = {0, };
+        bd_gfid_t  gfid     = {0, };
+        int        ret      = 0;
+        vg_t       vg       = NULL;
+        lv_t       lv       = NULL;
+
+        uuid_utoa_r (uuid, gfid);
+
+        runinit (&runner);
+
+        runner_add_args (&runner, LVM_RESIZE, NULL);
+        runner_argprintf (&runner, "%s/%s", priv->vg, gfid);
+        /* off_t is not 'long' everywhere; print it through a cast */
+        runner_argprintf (&runner, "-L%lldb", (long long) size);
+        runner_add_args (&runner, "-f", NULL);
+
+        runner_start (&runner);
+        runner_end (&runner);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return EAGAIN;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid);
+                ret = EIO;
+                goto out;
+        }
+        new_size = lvm_lv_get_size (lv);
+
+        if (new_size != (uint64_t) size) {
+                gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %llu does "
+                        "not match requested size %lld",
+                        (unsigned long long) new_size, (long long) size);
+                ret = EIO;
+        }
+
+out:
+        lvm_vg_close (vg);
+        return ret;
+}
+
+/*
+ * Return the extent size of the configured VG as reported by
+ * lvm_vg_get_extent_size(), or 0 when the VG cannot be opened.
+ */
+uint64_t
+bd_get_default_extent (bd_priv_t *priv)
+{
+        uint64_t  extent = 0;
+        vg_t      handle = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+
+        if (handle == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return 0;
+        }
+
+        extent = lvm_vg_get_extent_size (handle);
+        lvm_vg_close (handle);
+
+        return extent;
+}
+
+/*
+ * Rounds the user specified size up to a whole number of VG extents.
+ * Returns 0 when the extent size cannot be determined.
+ */
+uint64_t
+bd_adjust_size (bd_priv_t *priv, uint64_t size)
+{
+        uint64_t per_extent = bd_get_default_extent (priv);
+        uint64_t count      = 0;
+
+        if (per_extent == 0)
+                return 0;
+
+        /* whole extents, plus one more for any remainder */
+        count = size / per_extent + ((size % per_extent) ? 1 : 0);
+
+        return per_extent * count;
+}
+
+/*
+ * Remove LV @lv_name from the configured VG.
+ *
+ * Returns the lvm_vg_remove_lv() result on the removal path and -1 when
+ * the VG cannot be opened or the LV does not exist. *@op_errno is 0 on
+ * success, ENOENT for a missing VG/LV, and otherwise the errno of the
+ * failed removal (EIO when errno was not set).
+ */
+int
+bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno)
+{
+        vg_t  vg  = NULL;
+        lv_t  lv  = NULL;
+        int   ret = -1;
+
+        *op_errno = 0;
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                *op_errno = ENOENT;
+                return -1;
+        }
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (!lv) {
+                gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name);
+                *op_errno = ENOENT;
+                goto out;
+        }
+        ret = lvm_vg_remove_lv (lv);
+        if (ret < 0) {
+                /* capture errno before logging can overwrite it */
+                *op_errno = errno ? errno : EIO;
+                gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed",
+                        lv_name);
+                goto out;
+        }
+out:
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Set the access and/or modification time of @iatt to the current
+ * CLOCK_REALTIME time. @flag is a mask of GF_SET_ATTR_ATIME and
+ * GF_SET_ATTR_MTIME selecting which timestamps to update.
+ *
+ * Deliberately not declared 'inline': the function is called from other
+ * translation units, and a plain 'inline' definition does not provide the
+ * external definition those callers need under C99/C11 rules.
+ */
+void
+bd_update_amtime(struct iatt *iatt, int flag)
+{
+        struct timespec ts = {0, };
+
+        clock_gettime (CLOCK_REALTIME, &ts);
+        if (flag & GF_SET_ATTR_ATIME) {
+                iatt->ia_atime = ts.tv_sec;
+                iatt->ia_atime_nsec = ts.tv_nsec;
+        }
+        if (flag & GF_SET_ATTR_MTIME) {
+                iatt->ia_mtime = ts.tv_sec;
+                iatt->ia_mtime_nsec = ts.tv_nsec;
+        }
+}
+
+/*
+ * Create a snapshot LV named after local->dloc->gfid whose origin is the
+ * LV named after local->loc.gfid, by running LVM_CREATE --snapshot. A size
+ * argument is passed only when the origin type is not BD_THIN.
+ *
+ * Returns 0 when /dev/<vg>/<dest> exists afterwards, EIO when it does not
+ * and ENOMEM when the path cannot be allocated.
+ */
+int
+bd_snapshot_create (bd_local_t *local, bd_priv_t *priv)
+{
+        runner_t     cmd       = {0, };
+        struct stat  stbuf     = {0, };
+        bd_gfid_t    snap_name = {0, };
+        bd_gfid_t    orig_name = {0, };
+        char        *snap_path = NULL;
+        int          rc        = 0;
+
+        uuid_utoa_r (local->dloc->gfid, snap_name);
+        uuid_utoa_r (local->loc.gfid, orig_name);
+
+        gf_asprintf (&snap_path, "/dev/%s/%s", priv->vg, snap_name);
+        if (snap_path == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&cmd);
+        runner_add_args (&cmd, LVM_CREATE, NULL);
+        runner_add_args (&cmd, "--snapshot", NULL);
+        runner_argprintf (&cmd, "/dev/%s/%s", priv->vg, orig_name);
+        runner_add_args (&cmd, "--name", NULL);
+        runner_argprintf (&cmd, "%s", snap_name);
+        /* NOTE(review): "%ld" presumes local->size is a long - confirm */
+        if (strcmp (local->bdatt->type, BD_THIN) != 0)
+                runner_argprintf (&cmd, "-L%ldB", local->size);
+        runner_start (&cmd);
+        runner_end (&cmd);
+
+        /* the snapshot device node must exist once the command has run */
+        rc = (lstat (snap_path, &stbuf) < 0) ? EIO : 0;
+
+        GF_FREE (snap_path);
+        return rc;
+}
+
+/*
+ * Create a new LV named after local->dloc->gfid (same type and size as
+ * the source) and copy the contents of the LV named after local->loc.gfid
+ * into it, using IOV_NR aligned buffers of IOV_SIZE bytes and O_DIRECT
+ * descriptors.
+ *
+ * Returns 0 on success, ENOMEM on allocation failure, the bd_create()
+ * error, or the errno of the failing open/readv/writev.
+ */
+int
+bd_clone (bd_local_t *local, bd_priv_t *priv)
+{
+        int            ret          = ENOMEM;
+        int            fd1          = -1;
+        int            fd2          = -1;
+        int            i            = 0;
+        char          *buff         = NULL;
+        ssize_t        bytes        = 0;
+        char          *spath        = NULL;
+        char          *dpath        = NULL;
+        struct iovec  *vec          = NULL;
+        bd_gfid_t      source       = {0, };
+        bd_gfid_t      dest         = {0, };
+        void          *bufp[IOV_NR] = {0, };
+
+        vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec);
+        if (!vec)
+                return ENOMEM;
+
+        for (i = 0; i < IOV_NR; i++) {
+                bufp[i] = page_aligned_alloc (IOV_SIZE, &buff);
+                /*
+                 * Test the returned allocation: buff may still hold the
+                 * aligned pointer of the previous iteration.
+                 */
+                if (!bufp[i])
+                        goto out;
+                vec[i].iov_base = buff;
+                vec[i].iov_len = IOV_SIZE;
+        }
+
+        uuid_utoa_r (local->loc.gfid, source);
+        uuid_utoa_r (local->dloc->gfid, dest);
+
+        gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source);
+        gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest);
+        if (!spath || !dpath)
+                goto out;
+
+        ret = bd_create (local->dloc->gfid, local->size,
+                         local->bdatt->type, priv);
+        if (ret)
+                goto out;
+
+        fd1 = open (spath, O_RDONLY | O_DIRECT);
+        if (fd1 < 0) {
+                ret = errno;
+                goto out;
+        }
+        fd2 = open (dpath, O_WRONLY | O_DIRECT);
+        if (fd2 < 0) {
+                ret = errno;
+                goto out;
+        }
+
+        while (1) {
+                bytes = readv (fd1, vec, IOV_NR);
+                if (bytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s",
+                                strerror (ret));
+                        goto out;
+                }
+                if (!bytes)
+                        break;
+                bytes = writev (fd2, vec, IOV_NR);
+                if (bytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING,
+                                "write failed: %s", strerror (ret));
+                        goto out;
+                }
+        }
+        ret = 0;
+
+out:
+        for (i = 0; i < IOV_NR; i++)
+                GF_FREE (bufp[i]);
+        GF_FREE (vec);
+
+        if (fd1 != -1)
+                close (fd1);
+        if (fd2 != -1)
+                close (fd2);
+
+        /* both paths come from gf_asprintf(), so release with GF_FREE() */
+        GF_FREE (spath);
+        GF_FREE (dpath);
+
+        return ret;
+}
+
+/*
+ * Merges the snapshot LV named after @gfid into its origin LV by running
+ * LVM_CONVERT --merge.
+ *
+ * Returns 0 when the snapshot device node is gone afterwards, EIO when it
+ * still exists and ENOMEM when the device path cannot be allocated.
+ */
+int
+bd_merge (bd_priv_t *priv, uuid_t gfid)
+{
+        bd_gfid_t    dest   = {0, };
+        char        *path   = NULL;
+        struct stat  stbuf  = {0, };
+        runner_t     runner = {0, };
+        int          ret    = 0;
+
+        uuid_utoa_r (gfid, dest);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest);
+        if (!path) {
+                /* without a path there is nothing to merge or to verify */
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&runner);
+        runner_add_args (&runner, LVM_CONVERT, NULL);
+        runner_add_args (&runner, "--merge", NULL);
+        runner_argprintf (&runner, "%s", path);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        /* a successful merge removes the snapshot device node */
+        if (!lstat (path, &stbuf))
+                ret = EIO;
+
+        GF_FREE (path);
+
+        return ret;
+}
+
+/*
+ * Look up the "origin" property of the LV that backs @fd's inode (or
+ * @loc's inode when @fd is NULL) and store a copy of it in @dict under
+ * BD_ORIGIN.
+ *
+ * Returns ENOENT when the VG or the LV is not found, ENODATA when the LV
+ * has no valid origin property, otherwise the dict_set_dynstr() result.
+ */
+int
+bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict)
+{
+        lvm_property_value_t  origin_prop = {0, };
+        bd_gfid_t             lv_name     = {0, };
+        vg_t                  vg          = NULL;
+        lv_t                  lv          = NULL;
+        inode_t              *inode       = NULL;
+        char                 *copy        = NULL;
+        int                   rc          = -1;
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (vg == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        inode = (fd != NULL) ? fd->inode : loc->inode;
+
+        uuid_utoa_r (inode->gfid, lv_name);
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (lv == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "LV %s not found",
+                        lv_name);
+                rc = ENOENT;
+                goto close_vg;
+        }
+
+        origin_prop = lvm_lv_get_property (lv, "origin");
+        if (!origin_prop.is_valid || !origin_prop.value.string) {
+                rc = ENODATA;
+                goto close_vg;
+        }
+
+        copy = gf_strdup (origin_prop.value.string);
+        rc = dict_set_dynstr (dict, BD_ORIGIN, copy);
+
+close_vg:
+        lvm_vg_close (vg);
+        return rc;
+}
+
diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c
new file mode 100644
index 000000000..405474c58
--- /dev/null
+++ b/xlators/storage/bd/src/bd.c
@@ -0,0 +1,2404 @@
+/*
+ BD translator V2 - Exports Block devices on server side as regular
+ files to client
+
+ Now only exporting Logical volumes supported.
+
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#include <openssl/md5.h>
+#include <time.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "bd-aio.h"
+#include "defaults.h"
+#include "glusterfs3-xdr.h"
+#include "run.h"
+#include "protocol-common.h"
+#include "checksum.h"
+
+/*
+ * Callback shared by the setxattr and removexattr calls wound from
+ * bd_get_bd_info. The outcome is ignored and the call stack of the frame is
+ * destroyed. FIXME: How to handle remove/setxattr failure
+ */
+int
+bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, dict_t *xdata)
+{
+        call_stack_t *stack = frame->root;
+
+        STACK_DESTROY (stack);
+        return 0;
+}
+
+/*
+ * bd_get_bd_info: Finds out whether the file @gfid is mapped to a BD by
+ * reading BD_XATTR from @xattr and validating it with bd_validate_bd_xattr.
+ *
+ * - validation < 0: the mapping is stale, BD_XATTR is removed from the
+ *   posix file through a copied frame.
+ * - validation == 1: BD_XATTR size and LV size differ, BD_XATTR is rewritten
+ *   with the values returned in @type and @size.
+ *
+ * BD_XATTR is always deleted from @xattr before returning (once it was
+ * found there).
+ *
+ * Returns 0 when the file is mapped to a BD (@type and @size are filled
+ * in), non-zero otherwise (1 when there is no BD_XATTR at all).
+ */
+int
+bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid,
+                char **type, uint64_t *size)
+{
+        char         *bd_xattr = NULL;
+        char         *bd       = NULL;
+        int           ret      = -1;
+        loc_t         loc      = {0, };
+        dict_t       *dict     = NULL;
+        char         *p        = NULL;
+        call_frame_t *bd_frame = NULL;
+
+        if (!xattr)
+                return 1;
+
+        if (dict_get_str (xattr, BD_XATTR, &p))
+                return 1;
+
+        bd_xattr = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (bd_xattr, ret, out);
+
+        memcpy (loc.gfid, gfid, sizeof (uuid_t));
+
+        bd_frame = copy_frame (frame);
+        BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out);
+
+        ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid);
+        if (ret < 0) {/* LV does not exist */
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->removexattr, &loc,
+                            BD_XATTR, NULL);
+                /* the callback destroys the copied stack */
+                bd_frame = NULL;
+
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Mapped LV not available for posix file <gfid:%s>, "
+                        "deleting mapping", uuid_utoa (gfid));
+        } else if (ret == 1) {
+                /* BD_XATTR size and LV size mismatch. Update BD_XATTR */
+                gf_asprintf (&bd, "%s:%ld", *type, *size);
+
+                dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (dict, ret, out);
+
+                ret = dict_set_dynstr (dict, BD_XATTR, bd);
+                if (ret)
+                        goto out;
+
+                /* bd now belongs to dict, it must not be freed below */
+                bd = NULL;
+
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0,
+                            NULL);
+                /* the callback destroys the copied stack */
+                bd_frame = NULL;
+        }
+
+out:
+        if (dict)
+                dict_unref (dict);
+        /* frame was copied but never wound: nobody else will destroy it */
+        if (bd_frame)
+                STACK_DESTROY (bd_frame->root);
+        dict_del (xattr, BD_XATTR);
+        GF_FREE (bd_xattr);
+        GF_FREE (bd);
+        return ret;
+}
+
+/*
+ * bd_lookup_cbk: Callback of the lookup wound to the child by bd_lookup.
+ * For a regular file that is mapped to a BD, the iatt handed back to the
+ * caller is the one kept in the inode context; it is created and cached
+ * here (with size and block count taken from the LV) when not yet present.
+ */
+int32_t
+bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr,
+               struct iatt *postparent)
+{
+        bd_attr_t *cached  = NULL;
+        uint64_t   lv_size = 0;
+        char      *lv_type = BD_TYPE_NONE;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto unwind;
+
+        if (bd_inode_ctx_get (inode, this, &cached)) {
+                /* iatt not cached yet: check for a BD mapping */
+                if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid,
+                                    &lv_type, &lv_size))
+                        goto unwind;
+
+                /* BD file, update buf */
+                cached = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+                if (!cached) {
+                        op_errno = ENOMEM;
+                        goto unwind;
+                }
+                memcpy (&cached->iatt, buf, sizeof (struct iatt));
+                cached->type = lv_type;
+
+                /* Cache LV size in inode_ctx */
+                if (bd_inode_ctx_set (inode, this, cached) < 0) {
+                        GF_FREE (cached);
+                        op_errno = EINVAL;
+                        goto unwind;
+                }
+
+                cached->iatt.ia_size = lv_size;
+                cached->iatt.ia_blocks = lv_size / 512;
+        }
+
+        dict_del (xattr, GF_CONTENT_KEY);
+        memcpy (buf, &cached->iatt, sizeof (struct iatt));
+
+unwind:
+        BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+                         xattr, postparent);
+        return 0;
+}
+
+/*
+ * bd_lookup: Issues posix_lookup to find out if file is mapped to BD
+ * bd_lookup -> posix_lookup -> bd_lookup_cbk
+ *
+ * When the inode has no cached BD attributes, BD_XATTR is added to the
+ * request dict (a dict is created when the caller passed none) so that the
+ * child returns the mapping along with the lookup reply.
+ */
+int32_t
+bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+        dict_t    *bd_xattr = NULL;
+        bd_attr_t *bdatt    = NULL;
+        int        op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->path, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) {
+                if (!xattr_req) {
+                        bd_xattr = dict_new ();
+                        BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out);
+                        xattr_req = bd_xattr;
+                }
+                if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0)
+                        goto out;
+        }
+
+        STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, loc, xattr_req);
+
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+        return 0;
+out:
+        /* release the request dict created here, if any */
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+
+        BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_forget: Drops the BD attributes cached in the inode context, if any.
+ * Always returns 0.
+ */
+int
+bd_forget (xlator_t *this, inode_t *inode)
+{
+        int        ret   = -1;
+        uint64_t   ctx   = 0;
+        bd_attr_t *bdatt = NULL;
+
+        ret = bd_inode_ctx_get (inode, this, &bdatt);
+        if (!ret) {
+                inode_ctx_del (inode, this, &ctx);
+                /* the cached bd_attr_t is allocated with GF_CALLOC, so it
+                 * has to be released with GF_FREE.
+                 * NOTE(review): bdatt->type is left alone because its
+                 * allocator is not visible here - confirm who owns it */
+                GF_FREE (bdatt);
+        }
+        return 0;
+}
+
+/*
+ * bd_readdirp_cbk: For every regular file entry that turns out to be mapped
+ * to a BD, the size and block count in the entry's stat are replaced by the
+ * values obtained from bd_get_bd_info.
+ */
+int
+bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+        gf_dirent_t *entry   = NULL;
+        uint64_t     lv_size = 0;
+        char        *lv_type = NULL;
+
+        if (op_ret >= 0) {
+                list_for_each_entry (entry, &entries->list, list) {
+                        /* only regular files can be BD backed */
+                        if (entry->d_type == DT_REG &&
+                            !bd_get_bd_info (frame, this, entry->dict,
+                                             entry->d_stat.ia_gfid,
+                                             &lv_type, &lv_size)) {
+                                entry->d_stat.ia_size = lv_size;
+                                entry->d_stat.ia_blocks = lv_size / 512;
+                                FREE (lv_type);
+                        }
+                }
+        }
+
+        BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
+        return 0;
+}
+
+/*
+ * bd_readdirp: Asks the child for BD_XATTR along with every entry; the
+ * sizes of BD mapped files are then fixed up in bd_readdirp_cbk. A request
+ * dict is created (and kept in the frame local) when the caller gave none.
+ */
+int32_t
+bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *dict)
+{
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (fd, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        if (dict == NULL) {
+                local = bd_local_init (frame, this);
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+
+                local->dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, unwind);
+
+                dict = local->dict;
+        }
+
+        if (dict_set_int8 (dict, BD_XATTR, 0) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set key %s", BD_XATTR);
+                goto unwind;
+        }
+
+        STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict);
+        return 0;
+}
+
+/*
+ * bd_stat_cbk: Callback of the stat wound by bd_stat. For a regular file
+ * whose inode has cached BD attributes, the returned iatt is replaced by
+ * the cached one.
+ */
+int
+bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto out;
+
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /* update buf with LV size; copy from the iatt member explicitly,
+         * exactly as bd_fstat_cbk does, instead of relying on the layout
+         * of bd_attr_t */
+        if (!bd_inode_ctx_get (local->inode, this, &bdatt))
+                memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+        BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_stat: Answers from the iatt cached in the inode context when there is
+ * one; otherwise the stat is forwarded to the child and completed in
+ * bd_stat_cbk.
+ */
+int
+bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (loc, unwind);
+        VALIDATE_OR_GOTO (loc->path, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        if (bd_inode_ctx_get (loc->inode, this, &cached) == 0) {
+                /* already cached, no need to go to the child */
+                BD_STACK_UNWIND (stat, frame, 0, 0, &cached->iatt, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+        local->inode = inode_ref (loc->inode);
+
+        STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->stat, loc, xdata);
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_statfs_cbk: Adds the size and the free size of the VG, expressed in
+ * units of f_frsize, to the block counters returned by the child. Fails
+ * with EAGAIN when the VG cannot be opened.
+ */
+int
+bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct statvfs *buff, dict_t *xdata)
+{
+        bd_priv_t *priv      = NULL;
+        vg_t       vg        = NULL;
+        uint64_t   vg_size   = 0;
+        uint64_t   vg_free   = 0;
+
+        if (op_ret >= 0) {
+                priv = this->private;
+
+                vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+                if (vg) {
+                        vg_size = lvm_vg_get_size (vg);
+                        vg_free = lvm_vg_get_free_size (vg);
+                        lvm_vg_close (vg);
+
+                        buff->f_blocks += vg_size / buff->f_frsize;
+                        buff->f_bfree += vg_free / buff->f_frsize;
+                        buff->f_bavail += vg_free / buff->f_frsize;
+                } else {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "opening VG %s failed", priv->vg);
+                        op_ret = -1;
+                        op_errno = EAGAIN;
+                }
+        }
+
+        BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata);
+        return 0;
+}
+
+/*
+ * bd_statfs: Forwards statfs to the child; the VG size and free size are
+ * added to the reply in bd_statfs_cbk.
+ */
+int
+bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+        VALIDATE_OR_GOTO (loc, unwind);
+
+        STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->statfs, loc, xdata);
+        return 0;
+
+unwind:
+        /* one of the arguments was missing */
+        BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_fstat_cbk: Callback of the fstat wound by bd_fstat. For a regular file
+ * whose inode has cached BD attributes, the returned iatt is replaced by
+ * the cached one.
+ */
+int
+bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local  = frame->local;
+        bd_attr_t  *cached = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto unwind;
+
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, unwind);
+
+        /* update buf with LV size */
+        if (bd_inode_ctx_get (local->inode, this, &cached) == 0)
+                memcpy (buf, &cached->iatt, sizeof (struct iatt));
+
+unwind:
+        BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_fstat: Answers from the iatt cached in the inode context when there is
+ * one; otherwise the fstat is forwarded to the child and completed in
+ * bd_fstat_cbk.
+ */
+int
+bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (fd, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        /* if its already cached return it */
+        if (bd_inode_ctx_get (fd->inode, this, &cached) == 0) {
+                BD_STACK_UNWIND (fstat, frame, 0, 0, &cached->iatt, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+
+        local->inode = inode_ref (fd->inode);
+
+        STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD
+ * file
+ *
+ * For a BD file the data is read with pread() from the BD fd kept in the fd
+ * context. op_errno is set to ENOENT when the read reaches the cached LV
+ * size (or that size is 0), and the cached atime is updated.
+ */
+int
+bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int             ret      = -1;
+        int             _fd      = -1;
+        int32_t         op_ret   = -1;
+        int32_t         op_errno = 0;
+        bd_fd_t        *bd_fd    = NULL;
+        struct iovec    vec      = {0, };
+        struct iobuf   *iobuf    = NULL;
+        struct iobref  *iobref   = NULL;
+        uint64_t        bd_size  = 0;
+        bd_attr_t      *bdatt    = NULL;
+        struct iatt    *stbuf    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->readv,
+                            fd, size, offset, flags, xdata);
+                return 0;
+        }
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto out;
+        }
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+        _fd = bd_fd->fd;
+        op_ret = pread (_fd, iobuf->ptr, size, offset);
+        if (op_ret == -1) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "read failed on fd=%p: %s", fd,
+                        strerror (op_errno));
+                goto out;
+        }
+
+        vec.iov_base = iobuf->ptr;
+        vec.iov_len = op_ret;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto out;
+        }
+        iobref_add (iobref, iobuf);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                op_errno = EINVAL;
+                op_ret = -1;
+                goto out;
+        }
+        stbuf = &bdatt->iatt;
+
+        /* reading up to (or past) the LV size is flagged with ENOENT */
+        bd_size = bdatt->iatt.ia_size;
+        if (!bd_size || (offset + vec.iov_len) >= bd_size)
+                op_errno = ENOENT;
+
+        op_ret = vec.iov_len;
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME);
+
+out:
+        /* stbuf stays NULL on the paths where no BD attributes were
+         * fetched, so no member of a NULL bdatt is ever addressed */
+        BD_STACK_UNWIND (readv, frame, op_ret, op_errno,
+                         &vec, 1, stbuf, iobref, NULL);
+
+        if (iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        return 0;
+}
+
+#ifdef BLKDISCARD
+/*
+ * bd_discard: Issues the BLKDISCARD ioctl on the block device for the given
+ * offset/len. Files without cached BD attributes are handed to the child.
+ * On success the cached mtime is updated and the iatt before and after the
+ * update are returned.
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        int          ret      = -1;
+        int          op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        uint64_t     range[2] = {0, };
+        bd_attr_t   *bdatt    = NULL;
+        struct iatt  prebuf   = {0, };
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+        VALIDATE_OR_GOTO (fd, unwind);
+
+        /* posix */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) != 0) {
+                STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->discard,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                op_errno = EINVAL;
+                goto unwind;
+        }
+
+        /* start and length of the range to discard */
+        range[0] = offset;
+        range[1] = len;
+        ret = ioctl (bd_fd->fd, BLKDISCARD, range);
+        if (ret < 0) {
+                /* ENOTTY is reported to the caller as ENOSYS */
+                op_errno = (errno == ENOTTY) ? ENOSYS : errno;
+                goto unwind;
+        }
+        memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+
+        BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf,
+                         &bdatt->iatt, xdata);
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+#else
+
+/*
+ * bd_discard: BLKDISCARD is not defined at build time, every discard
+ * request fails with ENOSYS.
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL);
+        return 0;
+}
+#endif
+
+/*
+ * Call back from posix_open for opening the backing posix file
+ * If it failed, close BD fd
+ *
+ * The BD fd is also removed from the fd context, the same way bd_release
+ * does it, so that no stale pointer to the freed bd_fd_t is left behind.
+ */
+int
+bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        uint64_t   tmp_bfd = 0;
+
+        if (!op_ret)
+                goto out;
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (!bdatt) /* posix file */
+                goto out;
+
+        /* posix open failed: take the BD fd out of the fd context before
+         * releasing it, otherwise bd_release would find the freed pointer
+         * and close/free it a second time */
+        if (fd_ctx_del (fd, this, &tmp_bfd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bd_fd is NULL from fd=%p", fd);
+                goto out;
+        }
+        bd_fd = (bd_fd_t *)(long)tmp_bfd;
+        if (bd_fd) {
+                close (bd_fd->fd);
+                GF_FREE (bd_fd);
+        }
+
+out:
+        BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_open: Opens BD file if given posix file is mapped to BD. Also opens
+ * posix file.
+ * fd contains both posix and BD fd
+ *
+ * The BD device /dev/<vg>/<gfid> is opened first and stored in the fd
+ * context; the open is then wound to the child. Any failure before the
+ * wind unwinds the open with the error and releases the BD fd.
+ */
+int32_t
+bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+        int32_t    ret     = EINVAL;
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        bd_gfid_t  gfid    = {0, };
+        char      *devpath = NULL;
+        bd_priv_t *priv    = NULL;
+        int        _fd     = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* only dereference this after it has been validated */
+        priv = this->private;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                goto posix;
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        /* the content of devpath is undefined when asprintf fails */
+        if (asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid) < 0)
+                devpath = NULL;
+        BD_VALIDATE_MEM_ALLOC (devpath, ret, out);
+
+        _fd = open (devpath, flags | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out);
+
+        bd_fd->fd = _fd;
+        bd_fd->flag = flags | O_LARGEFILE;
+
+        if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                ret = EINVAL;
+                goto out;
+        }
+
+        ret = 0;
+
+posix:
+        /* the device path is not needed once the device is open */
+        FREE (devpath);
+
+        /* open posix equivalant of this file, fd needed for fd related
+           operations like fsetxattr, ftruncate etc */
+        STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL);
+
+        FREE (devpath);
+        if (_fd >= 0)
+                close (_fd);
+        GF_FREE (bd_fd);
+
+        return 0;
+}
+
+/*
+ * Callback of the setattr wound by bd_fsync. The fsync is unwound with the
+ * result of the setattr; the iatt kept in the frame local is returned both
+ * as pre and as post attributes.
+ */
+int
+bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, struct iatt *pre,
+                      struct iatt *post, dict_t *xdata)
+{
+        bd_local_t  *local = frame->local;
+        struct iatt *stbuf = &local->bdatt->iatt;
+
+        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, stbuf, stbuf, NULL);
+        return 0;
+}
+
+/*
+ * bd_do_fsync: Flushes fd with fdatasync() when datasync is set and
+ * HAVE_FDATASYNC is defined, with fsync() otherwise.
+ * Returns 0 on success, or the errno of the failed call (which is also
+ * logged).
+ */
+int
+bd_do_fsync (int fd, int datasync)
+{
+ int op_errno = 0;
+
+#ifdef HAVE_FDATASYNC
+ if (datasync) {
+ if (fdatasync (fd)) {
+ op_errno = errno;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "fdatasync on fd=%d failed: %s",
+ fd, strerror (errno));
+ }
+
+ } else
+#endif
+ /* without HAVE_FDATASYNC this block is unconditional; with it, this is
+ * the else branch of the datasync check above */
+ {
+ if (fsync (fd)) {
+ op_errno = errno;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "fsync on fd=%d failed: %s",
+ fd, strerror (op_errno));
+ }
+ }
+
+ return op_errno;
+}
+
+/*
+ * bd_fsync: Syncs if BD fd, forwards the request to posix
+ * fsync -> posix_setattr -> posix_fsync
+ *
+ * A file without BD fd or cached BD attributes is simply forwarded to the
+ * child. For a BD file the device is flushed with bd_do_fsync; on a full
+ * fsync (datasync == 0) the a|mtime are additionally pushed to the posix
+ * file with setattr and the fsync is unwound from bd_fsync_setattr_cbk.
+ */
+int32_t
+bd_fsync (call_frame_t *frame, xlator_t *this,
+          fd_t *fd, int32_t datasync, dict_t *xdata)
+{
+        int          ret      = -1;
+        int32_t      op_ret   = -1;
+        int32_t      op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        bd_attr_t   *bdatt    = NULL;
+        bd_local_t  *local    = NULL;
+        int          valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+        struct iatt  prebuf   = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        /* a missing inode ctx shows up as bdatt == NULL below */
+        (void) bd_inode_ctx_get (fd->inode, this, &bdatt);
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd || !bdatt) {
+                STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsync, fd, datasync,
+                            xdata);
+                return 0;
+        }
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_errno = bd_do_fsync (bd_fd->fd, datasync);
+        if (op_errno)
+                goto out;
+
+        /* For BD, Update the a|mtime during full fsync only */
+        if (!datasync) {
+                local = bd_local_init (frame, this);
+                /* In case of mem failure, should posix flush called ? */
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+                local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+                BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+                local->bdatt->type = gf_strdup (bdatt->type);
+                memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt));
+                bd_update_amtime (&local->bdatt->iatt, valid);
+                uuid_copy (local->loc.gfid, fd->inode->gfid);
+                STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setattr, &local->loc,
+                            &local->bdatt->iatt,
+                            valid, NULL);
+                return 0;
+        }
+
+        /* datasync succeeded and nothing else is left to do */
+        op_ret = 0;
+
+out:
+        /* bdatt is NULL when argument validation failed */
+        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf,
+                         bdatt ? &bdatt->iatt : NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback of the setattr wound by bd_flush: the flush is completed with
+ * whatever the setattr returned.
+ */
+int
+bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, struct iatt *pre,
+                      struct iatt *post, dict_t *xdata)
+{
+        BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_flush: For a BD file (cached BD attributes and a BD fd are present)
+ * the cached a|mtime are pushed to the posix file with setattr and the
+ * flush completes in bd_flush_setattr_cbk. In every other case the flush
+ * is forwarded to the child.
+ */
+int
+bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        bd_fd_t    *bd_fd    = NULL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+        loc_t       loc      = {0, };
+        int         valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, posix);
+        VALIDATE_OR_GOTO (this, posix);
+        VALIDATE_OR_GOTO (fd, posix);
+        VALIDATE_OR_GOTO (this->private, posix);
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (bdatt == NULL)
+                goto posix;
+
+        if (bd_fd_ctx_get (this, fd, &bd_fd) < 0 || bd_fd == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bdfd/bdatt is NULL from fd=%p", fd);
+                goto posix;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, posix);
+
+        local->fd = fd_ref (fd);
+        uuid_copy (loc.gfid, bdatt->iatt.ia_gfid);
+
+        /* Update the a|mtime during flush */
+        STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt,
+                    valid, NULL);
+        return 0;
+
+posix:
+        STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->flush, fd, xdata);
+        return 0;
+}
+
+/*
+ * bd_release: For a file with cached BD attributes, removes the BD fd from
+ * the fd context, closes it and frees its bd_fd_t. Always returns 0.
+ */
+int32_t
+bd_release (xlator_t *this, fd_t *fd)
+{
+        bd_priv_t *priv   = this->private;
+        bd_attr_t *bdatt  = NULL;
+        bd_fd_t   *bd_fd  = NULL;
+        uint64_t   ctxval = 0;
+
+        VALIDATE_OR_GOTO (this, done);
+        VALIDATE_OR_GOTO (fd, done);
+        VALIDATE_OR_GOTO (priv, done);
+
+        /* posix file */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) || !bdatt)
+                goto done;
+
+        /* FIXME: Update amtime during release */
+
+        if (fd_ctx_del (fd, this, &ctxval) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bfd is NULL from fd=%p", fd);
+                goto done;
+        }
+
+        bd_fd = (bd_fd_t *)(long)ctxval;
+        close (bd_fd->fd);
+        GF_FREE (bd_fd);
+
+done:
+        return 0;
+}
+
+/*
+ * Call back for removexattr after removing BD_XATTR incase of
+ * bd create failure
+ *
+ * The original (f)setxattr is always failed with EIO here, whatever the
+ * removexattr returned. The fop unwound matches the one that was called:
+ * fsetxattr when the local carries an fd, setxattr otherwise.
+ */
+int
+bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata);
+        return 0;
+
+}
+
+/*
+ * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure
+ * invokes posix_removexattr to remove created BD_XATTR
+ *
+ * When the LV was created, a copy of the BD attributes kept in the frame
+ * local is cached in the inode context before the (f)setxattr is unwound.
+ */
+int
+bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        if (op_ret < 0)
+                goto next;
+
+        /* Create LV */
+        op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size,
+                              local->bdatt->type, this->private);
+        if (!op_errno)
+                goto out;
+
+        /* LV creation failed, remove BD_XATTR */
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fremovexattr,
+                            local->fd, BD_XATTR, NULL);
+        else
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr,
+                            &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+
+        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        if (!bdatt) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto next;
+        }
+
+        memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt));
+        bdatt->type = gf_strdup (local->bdatt->type);
+
+        if (bd_inode_ctx_set (local->inode, this, bdatt) < 0) {
+                /* nothing was cached, so nobody else will free this copy;
+                 * bd_lookup_cbk fills the inode ctx when it is missing */
+                GF_FREE (bdatt->type);
+                GF_FREE (bdatt);
+        }
+
+next:
+        /* unwind the fop that was actually called */
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+        return 0;
+
+}
+
+/*
+ * Call back from posix_stat
+ *
+ * Parses the BD_XATTR value given by the user ("<type>[:<size>]", type
+ * being BD_LV or BD_THIN), rewrites it as "<type>:<size in bytes>" and
+ * winds (f)setxattr with it; the LV itself is created in bd_setx_setx_cbk.
+ * Only regular files can be mapped (EOPNOTSUPP otherwise). When no size is
+ * given the default extent size is used.
+ */
+int
+bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, struct iatt *iatt,
+                  dict_t *xdata)
+{
+        char       *param  = NULL;
+        char       *type   = NULL;
+        char       *s_size = NULL;
+        char       *p      = NULL;
+        bd_local_t *local  = frame->local;
+        bd_priv_t  *priv   = this->private;
+        char       *bd     = NULL;
+        uint64_t    size   = 0;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        /* NUL terminated private copy of the value, strtok_r modifies it */
+        param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        type = strtok_r (param, ":", &p);
+        if (!type) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given",
+                        type);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        s_size = strtok_r (NULL, ":", &p);
+
+        /* If size not specified get default size */
+        if (!s_size) {
+                size = bd_get_default_extent (priv);
+        } else if (gf_string2bytesize (s_size, &size)) {
+                gf_log (this->name, GF_LOG_WARNING, "Invalid size %s given",
+                        s_size);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        gf_asprintf (&bd, "%s:%ld", type, size);
+        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        /* bd now belongs to local->dict */
+        bd = NULL;
+
+        local->bdatt->type = gf_strdup (type);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt->type, op_errno, out);
+        memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt));
+        local->bdatt->iatt.ia_size = size;
+
+        /* type has been duplicated, the scratch copy is no longer needed */
+        GF_FREE (param);
+
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+        GF_FREE (bd);
+        GF_FREE (param);
+        return 0;
+}
+
+/*
+ * Callback of the removexattr wound by bd_offload_setx_cbk after a failed
+ * snapshot/clone: the original (f)setxattr is failed with EIO.
+ */
+int
+bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (!local->fd) {
+                BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL);
+                return 0;
+        }
+
+        BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL);
+        return 0;
+}
+
+/*
+ * Callback after BD_XATTR has been set on the destination file of an
+ * offload request. Creates the snapshot (BD_OF_SNAPSHOT) or the clone; if
+ * that fails, BD_XATTR is removed from the destination again and the
+ * request is failed from bd_offload_rm_xattr_cbk.
+ */
+int
+bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (local->offload == BD_OF_SNAPSHOT)
+                op_ret = bd_snapshot_create (frame->local, this->private);
+        else
+                op_ret = bd_clone (frame->local, this->private);
+
+        if (op_ret) {
+                STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr,
+                            local->dloc, BD_XATTR, NULL);
+                return 0;
+        }
+
+out:
+        /* both variants report op_ret followed by op_errno */
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback of the getxattr (BD_XATTR) issued on the source file of an
+ * offload request. The source value "<type>:<size>" is split; when no size
+ * was requested (clone) the source size is used. The destination file then
+ * gets BD_XATTR "<type>:<size>" via setxattr, continued in
+ * bd_offload_setx_cbk.
+ */
+int
+bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+        char       *bd    = NULL;
+        bd_local_t *local = frame->local;
+        char       *type  = NULL;
+        char       *p     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (dict_get_str (xattr, BD_XATTR, &p)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        type = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (type, op_errno, out);
+
+        p = strrchr (type, ':');
+        if (!p) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "source file xattr %s corrupted?", type);
+                goto out;
+        }
+
+        /* cut the value at the last ':' so that type holds the type only */
+        *p='\0';
+
+        /* For clone size is taken from source LV */
+        if (!local->size) {
+                p++;
+                gf_string2bytesize (p, &local->size);
+        }
+        gf_asprintf (&bd, "%s:%ld", type, local->size);
+        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+        local->bdatt->type = gf_strdup (type);
+        dict_del (local->dict, BD_XATTR);
+        dict_del (local->dict, LINKTO);
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        /* the scratch copy is not referenced past this point */
+        GF_FREE (type);
+
+        STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->setxattr,
+                    local->dloc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (type);
+        GF_FREE (bd);
+
+        return 0;
+}
+
+/*
+ * Callback of the lookup issued on the destination gfid of an offload
+ * request. The destination has to be a regular file on this brick (no
+ * LINKTO xattr) that is not mapped to a BD yet (EEXIST otherwise). When it
+ * qualifies, BD_XATTR of the source file is fetched and processing
+ * continues in bd_offload_getx_cbk.
+ */
+int
+bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno,
+                            inode_t *inode, struct iatt *iatt,
+                            dict_t *xattr, struct iatt *postparent)
+{
+        bd_local_t *local  = frame->local;
+        char       *bd     = NULL;
+        int         ret    = -1;
+        char       *linkto = NULL;
+
+        if (op_ret < 0 && op_errno != ENODATA) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a "
+                        "regular file");
+                goto out;
+        }
+
+        ret = dict_get_str (xattr, LINKTO, &linkto);
+        if (linkto) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination file not "
+                        "present in same brick");
+                goto out;
+        }
+
+        ret = dict_get_str (xattr, BD_XATTR, &bd);
+        if (bd) {
+                op_errno = EEXIST;
+                goto out;
+        }
+
+        /* allocate local->bdatt the same way bd_fsync and bd_setx_stat_cbk
+         * do, so that whatever releases the local can treat it uniformly.
+         * NOTE(review): the routine freeing local->bdatt is not visible
+         * here - confirm it uses GF_FREE */
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->getxattr,
+                    &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback of the unlink wound by bd_do_merge: the setxattr that requested
+ * the merge is completed with the result of the unlink.
+ */
+int
+bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+        /* FIXME: if delete failed, remove xattr */
+
+        BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_do_merge: Merges the snapshot LV of the file in the frame local into
+ * its origin with bd_merge and then unlinks the posix file of the snapshot
+ * (completed in bd_merge_unlink_cbk).
+ *
+ * Returns 0 once the unlink has been wound; otherwise the request is
+ * unwound with the error, which is also returned.
+ */
+int
+bd_do_merge(call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local    = frame->local;
+        inode_t    *parent   = NULL;
+        char       *p        = NULL;
+        int         op_errno = 0;
+
+        op_errno = bd_merge (this->private, local->inode->gfid);
+        if (op_errno)
+                goto out;
+
+        /*
+         * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does
+         * not have loc->pargfid set. Get parent's gfid by getting parents inode
+         */
+        parent = inode_parent (local->inode, NULL, NULL);
+        if (!parent) {
+                /*
+                 * FIXME: Snapshot LV already deleted.
+                 * remove xattr, instead of returning failure
+                 */
+                op_errno = EINVAL;
+                goto out;
+        }
+        uuid_copy (local->loc.pargfid, parent->gfid);
+        /* only the gfid was needed, drop the ref taken by inode_parent */
+        inode_unref (parent);
+
+        /* the entry name is the last component of the path */
+        if (!local->loc.path) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        p = strrchr (local->loc.path, '/');
+        if (p)
+                p++;
+        local->loc.name = p;
+
+        STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        /* unwind the fop that was actually called */
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        return op_errno;
+}
+
+/*
+ * bd_offload: Starts a clone/snapshot request. The xattr value kept in the
+ * frame local has the form "<destination gfid>[:<size>]". The destination
+ * is looked up by gfid, asking for BD_XATTR and LINKTO, and processing
+ * continues in bd_offload_dest_lookup_cbk.
+ *
+ * When no size is given, the default extent size is used unless the
+ * request is a clone (the size is then taken from the source later on).
+ * Always returns 0; failures are reported by unwinding the request.
+ */
+int
+bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            fd_t *fd, bd_offload_t offload)
+{
+        char       *param    = NULL;
+        char       *p        = NULL;
+        char       *size     = NULL;
+        char       *gfid     = NULL;
+        int         op_errno = 0;
+        bd_local_t *local    = frame->local;
+
+        /* NUL terminated private copy of the value, strtok_r modifies it */
+        param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->dloc = CALLOC (1, sizeof (loc_t));
+        BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        gfid = strtok_r (param, ":", &p);
+        if (!gfid) {
+                /* empty value, there is no destination to work on */
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        size = strtok_r (NULL, ":", &p);
+        if (size) {
+                if (gf_string2bytesize (size, &local->size)) {
+                        op_errno = EINVAL;
+                        goto out;
+                }
+        } else if (offload != BD_OF_CLONE)
+                local->size = bd_get_default_extent (this->private);
+
+        if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        if (dict_set_int8 (local->dict, LINKTO, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (uuid_parse (gfid, local->dloc->gfid)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid destination gfid %s", gfid);
+                op_errno = EINVAL;
+                goto out;
+        }
+        local->offload = offload;
+
+        /* gfid and size point into param, which is not used any further */
+        GF_FREE (param);
+
+        STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, local->dloc,
+                    local->dict);
+
+        return 0;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (param);
+        return 0;
+}
+
+/*
+ * bd_setxattr: Used to create & map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ *
+ * The keys BD_CLONE, BD_SNAPSHOT and BD_MERGE request an offload operation
+ * on a file that is already mapped to a BD. Any other xattr is passed on
+ * to the child untouched.
+ */
+int
+bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int flags, dict_t *xdata)
+{
+        int           op_errno = 0;
+        data_t       *data     = NULL;
+        bd_local_t   *local    = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE)))
+                cl_type = BD_OF_MERGE;
+
+        bd_inode_ctx_get (loc->inode, this, &bdatt);
+        if (!cl_type && !data) {
+                /* none of the BD keys: plain setxattr for the child */
+                STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, loc, dict,
+                            flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->data = data;
+        loc_copy (&local->loc, loc);
+        local->inode = inode_ref (loc->inode);
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s not mapped to BD", loc->path);
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                /* both helpers unwind the request themselves on failure */
+                if (cl_type == BD_OF_MERGE)
+                        bd_do_merge (frame, this);
+                else
+                        bd_offload (frame, this, loc, NULL, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s already mapped to BD", loc->path);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->stat, loc, xdata);
+        }
+
+        return 0;
+out:
+        /* unwind through BD_STACK_UNWIND like the rest of this file, so
+         * that a frame local set up above goes through the same unwind
+         * path as everywhere else */
+        if (op_errno)
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_fsetxattr: Used to create/map an LV to a posix file using
+ * BD_XATTR xattr, and to trigger clone/snapshot offloads on an open fd.
+ *
+ * Mapping path:
+ * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ * -> bd_fsetxattr_cbk
+ *
+ * BD_MERGE is rejected with EOPNOTSUPP. A dict carrying none of the bd
+ * keys is forwarded to the child untouched. Always returns 0; errors are
+ * reported by unwinding the frame.
+ */
+int32_t
+bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int flags, dict_t *xdata)
+{
+        /* EINVAL covers the argument checks below, which do not set it */
+        int           op_errno = EINVAL;
+        data_t       *data     = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_local_t   *local    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE))) {
+                /*
+                 * bd_merge is not supported for fsetxattr, because snapshot LV
+                 * is opened and it causes problem in snapshot merge
+                 */
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+        if (!cl_type && !data) {
+                /* non bd file object */
+                STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            fd, dict, flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->inode = inode_ref (fd->inode);
+        local->fd = fd_ref (fd);
+        local->data = data;
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p not mapped to BD", fd);
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                /* bd_offload unwinds the frame itself on failure */
+                bd_offload (frame, this, NULL, fd, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p already mapped to BD", fd);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        }
+
+        return 0;
+out:
+        /* this is the fsetxattr fop, so unwind it as such */
+        BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_removexattr: a request to remove BD_XATTR is answered with ENODATA
+ * without reaching the child; every other key is forwarded unchanged.
+ */
+int32_t
+bd_removexattr (call_frame_t *frame, xlator_t *this,
+                loc_t *loc, const char *name, dict_t *xdata)
+{
+        if (strcmp (name, BD_XATTR) == 0) {
+                BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_fremovexattr: fd based variant of bd_removexattr. A request to remove
+ * BD_XATTR is answered with ENODATA; every other key is forwarded to the
+ * child's fremovexattr.
+ */
+int32_t
+bd_fremovexattr (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, const char *name, dict_t *xdata)
+{
+        if (!strcmp (name, BD_XATTR))
+                goto out;
+
+        /* the reply must be unwound as fremovexattr, not removexattr */
+        STACK_WIND (frame, default_fremovexattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL);
+        return 0;
+}
+
+/*
+ * Callback of the [f]setxattr issued from the revert path of
+ * bd_trunc_setxattr_cbk. Whatever the outcome of the revert, the truncate
+ * is reported as failed with EIO.
+ */
+int
+bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *ctx = frame->local;
+
+        if (!ctx->fd) {
+                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+        } else {
+                BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * Call back for setxattr after setting the new size in BD_XATTR.
+ *
+ * On success the LV is resized to local->bdatt->iatt.ia_size, the cached
+ * size in the inode context is updated and the [f]truncate is unwound with
+ * the old and new attributes. If the resize fails, BD_XATTR is rewritten
+ * with the cached (old) size and bd_trunc_setxattr_setx_cbk reports the
+ * failure. All other failures unwind with EIO.
+ */
+int
+bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t  *local  = frame->local;
+        bd_attr_t   *bdatt  = NULL;
+        struct iatt  prebuf = {0, };
+        char        *bd     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        bd_inode_ctx_get (local->inode, this, &bdatt);
+        if (!bdatt) {
+                /* Without the cached type and size the old xattr value
+                 * can not be rebuilt, so there is nothing to revert to. */
+                goto out;
+        }
+
+        op_errno = bd_resize (this->private, local->inode->gfid,
+                              local->bdatt->iatt.ia_size);
+        if (op_errno)
+                goto revert_xattr;
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        /* LV resized, update new size in the cache */
+        bdatt->iatt.ia_size = local->bdatt->iatt.ia_size;
+
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+
+        return 0;
+
+revert_xattr:
+        /* revert setxattr: put the still cached old size back in the dict */
+        if (gf_asprintf (&bd, "%s:%"PRIu64, bdatt->type,
+                         bdatt->iatt.ia_size) < 0)
+                goto out;
+
+        /*
+         * Replace the value through the dict API. The string currently
+         * stored under BD_XATTR belongs to the dict and must not be freed
+         * behind its back.
+         * NOTE(review): bd is deliberately not freed when dict_set_dynstr
+         * fails, since ownership on failure is not visible from here;
+         * confirm against the dict implementation.
+         */
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd))
+                goto out;
+
+        if (local->fd)
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * Call back of the [f]stat issued by bd_do_trunc when a BD file has to
+ * grow. Builds the new BD_XATTR value "<type>:<new size>" and sends it to
+ * the child with [f]setxattr; bd_trunc_setxattr_cbk then resizes the LV.
+ * Any failure unwinds the [f]truncate with op_errno.
+ */
+int
+bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        char       *bd    = NULL;
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        bd_inode_ctx_get (local->inode, this, &bdatt);
+        if (!bdatt) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (gf_asprintf (&bd, "%s:%"PRIu64, bdatt->type,
+                         local->bdatt->iatt.ia_size) < 0) {
+                /* report the allocation failure as such */
+                bd = NULL;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (local->fd)
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        GF_FREE (bd);
+        return 0;
+}
+
+/*
+ * bd_do_trunc: common worker of bd_truncate (@loc set, @fd NULL) and
+ * bd_ftruncate (@fd set, @loc NULL) for a file mapped to an LV.
+ *
+ * A request that does not exceed the cached LV size only refreshes the
+ * cached mtime and succeeds immediately. Otherwise the adjusted target
+ * size is kept in local->bdatt and the grow sequence is started with a
+ * [f]stat on the child; bd_trunc_stat_cbk continues it.
+ */
+void
+bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc,
+             off_t offset, bd_attr_t *bdatt)
+{
+        bd_local_t  *local    = NULL;
+        struct iatt  prebuf   = {0, };
+        int          op_errno = 0;
+        int          op_ret   = -1;
+
+        /* If requested size is less than LV size, return success */
+        if (offset <= bdatt->iatt.ia_size) {
+                memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+                bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+                op_ret = 0;
+                goto out;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (fd) {
+                local->inode = inode_ref (fd->inode);
+                local->fd = fd_ref (fd);
+        } else {
+                local->inode = inode_ref (loc->inode);
+                loc_copy (&local->loc, loc);
+        }
+
+        local->bdatt->iatt.ia_size =
+                bd_adjust_size (this->private, offset);
+
+        /* The path based truncate has no fd, so it has to go through stat;
+         * fstat with a NULL fd can not succeed. */
+        if (fd)
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fstat, fd, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->stat, loc, NULL);
+
+        return;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        return;
+}
+
+/*
+ * bd_ftruncate: Resizes a LV if fd belongs to BD.
+ * An fd whose inode has no bd context is forwarded to the child's
+ * ftruncate; otherwise bd_do_trunc handles the request.
+ */
+int32_t
+bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+        /* only the argument checks reach "out"; report them as EINVAL
+         * instead of a failure with errno 0 */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_ftruncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->ftruncate, fd,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, fd, NULL, offset, bdatt);
+        return 0;
+out:
+        BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_truncate: Resizes a LV if file maps to LV.
+ * A loc whose inode has no bd context is forwarded to the child's
+ * truncate; otherwise bd_do_trunc handles the request.
+ */
+int32_t
+bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+        /* only the argument checks reach "out"; report them as EINVAL
+         * instead of a failure with errno 0 */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_truncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->truncate, loc,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, NULL, loc, offset, bdatt);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * __bd_pwritev: write @count iovecs to @fd at @offset with pwritev(2).
+ *
+ * Returns the number of bytes written, -EFAULT when @vector is NULL, or
+ * the negated errno of a failed pwritev. @bd_size is accepted for the
+ * callers' convenience but is not used: no clamping against the device
+ * size is done here.
+ */
+int32_t
+__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
+              uint64_t bd_size)
+{
+        int retval = 0;
+
+        if (!vector)
+                return -EFAULT;
+
+        retval = pwritev (fd, vector, count, offset);
+        if (retval == -1) {
+                /* capture errno before logging can disturb it */
+                retval = -errno;
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "pwritev of %d vector(s) at offset %"PRId64
+                        " failed: %s", count, (int64_t) offset,
+                        strerror (-retval));
+        }
+
+        return retval;
+}
+
+/*
+ * bd_writev: Writes to LV if its BD file or forwards the request to posix.
+ *
+ * An fd without bd fd context is forwarded to the child's writev with
+ * default_writev_cbk. For a BD fd the data is written directly with
+ * __bd_pwritev, the cached mtime is refreshed and the frame is unwound
+ * here with the attributes cached before and after the write.
+ */
+int
+bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdict)
+{
+        int32_t      op_ret   = -1;
+        /* the argument checks below do not set it themselves */
+        int32_t      op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        int          ret      = -1;
+        uint64_t     size     = 0;
+        struct iatt  prebuf   = {0, };
+        struct iatt *postbuf  = NULL;
+        bd_attr_t   *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (vector, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) { /* posix fd */
+                STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->writev, fd, vector, count,
+                            offset, flags, iobref, xdict);
+                return 0;
+        }
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+        size = bdatt->iatt.ia_size;
+
+        op_ret = __bd_pwritev (bd_fd->fd, vector, count, offset, size);
+        if (op_ret < 0) {
+                op_errno = -op_ret;
+                op_ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
+                        ", %s", (uint64_t) offset, strerror (op_errno));
+                goto out;
+        }
+
+        op_errno = 0;
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+out:
+        /* bdatt is still NULL when an early check failed; do not form a
+         * member address from it */
+        if (bdatt)
+                postbuf = &bdatt->iatt;
+
+        BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf,
+                         postbuf, NULL);
+        return 0;
+}
+
+/*
+ * bd_setattr_cbk: callback of the setattr wound by bd_setattr for a BD
+ * file. @cookie is the heap copy of the "valid" mask made by bd_setattr
+ * and is released here.
+ *
+ * On success every attribute selected in the mask is copied from @postbuf
+ * into the cached iatt of the inode, ctime is refreshed, and @postbuf is
+ * overwritten with the cached iatt before unwinding.
+ */
+int
+bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+        bd_attr_t  *bdatt = NULL;
+        int        *valid = cookie;
+        bd_local_t *local = frame->local;
+
+        if (op_ret < 0 || !valid || !local)
+                goto out;
+
+        if (bd_inode_ctx_get (local->inode, this, &bdatt))
+                goto out;
+
+        /* The mask is a bit set: one request may change several
+         * attributes at once, so each bit is handled on its own. */
+        if (*valid & GF_SET_ATTR_UID)
+                bdatt->iatt.ia_uid = postbuf->ia_uid;
+
+        if (*valid & GF_SET_ATTR_GID)
+                bdatt->iatt.ia_gid = postbuf->ia_gid;
+
+        if (*valid & GF_SET_ATTR_MODE) {
+                bdatt->iatt.ia_type = postbuf->ia_type;
+                bdatt->iatt.ia_prot = postbuf->ia_prot;
+        }
+
+        if (*valid & GF_SET_ATTR_ATIME) {
+                bdatt->iatt.ia_atime = postbuf->ia_atime;
+                bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec;
+        }
+
+        if (*valid & GF_SET_ATTR_MTIME) {
+                bdatt->iatt.ia_mtime = postbuf->ia_mtime;
+                bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec;
+        }
+
+        bdatt->iatt.ia_ctime = postbuf->ia_ctime;
+        bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec;
+
+        memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt));
+out:
+        FREE (valid);
+        BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf,
+                         postbuf, xdata);
+        return 0;
+}
+
+/*
+ * bd_setattr: setattr entry point. Inodes without bd context go straight
+ * to the child. For BD files the request is still sent to the child, but
+ * with bd_setattr_cbk as callback and a heap copy of @valid as cookie so
+ * the callback knows which attributes were requested.
+ */
+int
+bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+        int        *valid_copy = NULL;
+        bd_attr_t  *bdatt      = NULL;
+        bd_local_t *local      = NULL;
+        int         op_errno   = 0;
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->setattr,
+                           loc, stbuf, valid, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        valid_copy = CALLOC (1, sizeof (valid));
+        BD_VALIDATE_MEM_ALLOC (valid_copy, op_errno, out);
+
+        *valid_copy = valid;
+        local->inode = inode_ref (loc->inode);
+
+        STACK_WIND_COOKIE (frame, bd_setattr_cbk, valid_copy, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->setattr,
+                           loc, stbuf, valid, xdata);
+
+        return 0;
+out:
+        /* both jumps to this label are allocation failures */
+        BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_link_cbk: callback of the link wound by bd_link. For an inode with bd
+ * context the cached iatt takes over ctime and nlink from @buf, and @buf
+ * is then overwritten with the cached iatt. Other inodes and failures are
+ * passed through unchanged.
+ */
+int
+bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+        bd_attr_t *bdatt = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (bd_inode_ctx_get (inode, this, &bdatt))
+                goto out;
+
+        bdatt->iatt.ia_ctime = buf->ia_ctime;
+        bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec;
+        bdatt->iatt.ia_nlink = buf->ia_nlink;
+        memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+        /* hand the child's xdata on to the parent instead of dropping it */
+        BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf,
+                         preparent, postparent, xdata);
+        return 0;
+}
+
+/*
+ * bd_link: forwards the link request to the child unchanged; the reply is
+ * handled by bd_link_cbk. Always returns 0.
+ */
+int
+bd_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * bd_handle_special_xattrs: answers [f]getxattr for the keys served by
+ * this translator itself.
+ *
+ * VOL_TYPE yields 1, VOL_CAPS yields priv->caps, and any other name is
+ * handed to bd_get_origin. The reply is unwound as getxattr when @loc is
+ * set, otherwise as fgetxattr. Always returns 0.
+ */
+int
+bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                          fd_t *fd, const char *name, dict_t *xdata)
+{
+        dict_t    *xattr    = NULL;
+        int        op_ret   = -1;
+        int        op_errno = ENOMEM;
+        bd_priv_t *priv     = this->private;
+
+        xattr = dict_new ();
+        if (!xattr)
+                goto out;
+
+        if (!strcmp (name, VOL_TYPE))
+                op_ret = dict_set_int64 (xattr, (char *)name, 1);
+        else if (!strcmp (name, VOL_CAPS))
+                op_ret = dict_set_int64 (xattr, (char *)name, priv->caps);
+        else
+                op_ret = bd_get_origin (priv, loc, fd, xattr);
+
+        /* do not report a stale ENOMEM together with a successful reply */
+        if (op_ret == 0)
+                op_errno = 0;
+
+out:
+        if (loc)
+                BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+        else
+                BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+
+        /* xattr is NULL when dict_new failed */
+        if (xattr) {
+                dict_reset (xattr);
+                dict_unref (xattr);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_fgetxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; any other name (or no name at all) is
+ * forwarded to the child's fgetxattr.
+ */
+int
+bd_fgetxattr (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *name, dict_t *xdata)
+{
+        int handled_here = 0;
+
+        if (name) {
+                handled_here = !strcmp (name, VOL_TYPE) ||
+                               !strcmp (name, VOL_CAPS) ||
+                               !strcmp (name, BD_ORIGIN);
+        }
+
+        if (!handled_here) {
+                STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fgetxattr,
+                            fd, name, xdata);
+                return 0;
+        }
+
+        bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_getxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; any other name (or no name at all) is
+ * forwarded to the child's getxattr.
+ */
+int
+bd_getxattr (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *name, dict_t *xdata)
+{
+        int handled_here = 0;
+
+        if (name) {
+                handled_here = !strcmp (name, VOL_TYPE) ||
+                               !strcmp (name, VOL_CAPS) ||
+                               !strcmp (name, BD_ORIGIN);
+        }
+
+        if (!handled_here) {
+                STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->getxattr,
+                            loc, name, xdata);
+                return 0;
+        }
+
+        bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_unlink_lookup_cbk: callback of the lookup wound by bd_unlink.
+ *
+ * When the looked up file has no other hard link left, the LV named after
+ * its gfid is deleted first (a missing LV is tolerated); then the posix
+ * file is unlinked through the child using the loc saved in frame->local.
+ * A failed lookup or LV deletion unwinds the unlink with the error.
+ */
+int
+bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *buf, dict_t *xattr,
+                      struct iatt *postparent)
+{
+        bd_gfid_t   gfid  = {0, };
+        bd_local_t *local = frame->local;
+
+        /* local->loc is needed on every path below, so check it first */
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /* buf and inode are only meaningful after a successful lookup */
+        if (op_ret < 0)
+                goto out;
+
+        if (buf->ia_nlink > 1)
+                goto posix;
+
+        uuid_utoa_r (inode->gfid, gfid);
+        if (bd_delete_lv (this->private, gfid, &op_errno) < 0) {
+                if (op_errno != ENOENT)
+                        goto out;
+        }
+
+posix:
+        /* remove posix */
+        STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_unlink: unlink entry point. Inodes without bd context are unlinked by
+ * the child directly. For BD files the loc is saved in frame->local and a
+ * lookup is wound first; bd_unlink_lookup_cbk then removes the LV and the
+ * posix file.
+ */
+int
+bd_unlink (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, int xflag, dict_t *xdata)
+{
+        /* the argument checks do not set it; avoid failing with errno 0 */
+        int         op_errno = EINVAL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->unlink,
+                            loc, xflag, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        loc_copy (&local->loc, loc);
+
+        STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->lookup, loc, NULL);
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/* bd_priv: hook registered as dumpops.priv; a stub that only returns 0. */
+int32_t
+bd_priv (xlator_t *this)
+{
+ return 0;
+}
+
+/* bd_inode: hook registered as dumpops.inode; a stub that only returns 0. */
+int32_t
+bd_inode (xlator_t *this)
+{
+ return 0;
+}
+
+/*
+ * bd_rchecksum: computes the rsync weak checksum and the strong checksum
+ * of up to @len bytes read at @offset from a BD file.
+ *
+ * An fd without bd fd context is forwarded to the child's rchecksum. The
+ * checksums cover the bytes pread actually returned. Always returns 0; the
+ * result is delivered by unwinding the frame.
+ */
+int32_t
+bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              int32_t len, dict_t *xdata)
+{
+        int            op_ret        = -1;
+        int            op_errno      = 0;
+        int            ret           = 0;
+        char          *alloc_buf     = NULL;
+        char          *buf           = NULL;
+        int32_t        weak_checksum = 0;
+        bd_fd_t       *bd_fd         = NULL;
+        unsigned char  strong_checksum[MD5_DIGEST_LENGTH] = {0};
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->rchecksum, fd, offset,
+                            len, xdata);
+                return 0;
+        }
+
+        alloc_buf = page_aligned_alloc (len, &buf);
+        if (!alloc_buf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        LOCK (&fd->lock);
+        {
+                ret = pread (bd_fd->fd, buf, len, offset);
+                /* save errno right away, before anything can change it */
+                if (ret < 0)
+                        op_errno = errno;
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pread of %d bytes returned %d (%s)",
+                        len, ret, strerror (op_errno));
+                goto out;
+        }
+
+        /* pread may return fewer than len bytes; anything beyond ret in
+         * the buffer was never written and must not be checksummed */
+        weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf,
+                                                (size_t) ret);
+        gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret,
+                                  (unsigned char *) strong_checksum);
+
+        op_ret = 0;
+out:
+        BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno,
+                         weak_checksum, strong_checksum, NULL);
+
+        GF_FREE (alloc_buf);
+
+        return 0;
+}
+
+/**
+ * notify - answer GF_EVENT_PARENT_UP by announcing GF_EVENT_CHILD_UP
+ * through default_notify; every other event is ignored here.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        if (event == GF_EVENT_PARENT_UP) {
+                /* Tell the parent that bd xlator is up */
+                default_notify (this, GF_EVENT_CHILD_UP, data);
+        }
+
+        return 0;
+}
+
+/*
+ * mem_acct_init: sets up memory accounting for the translator with room
+ * for gf_bd_mt_end + 1 types. Returns -1 when @this is NULL, otherwise the
+ * result of xlator_mem_acct_init.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
+
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        return ret;
+}
+
+/*
+ * reconfigure: re-reads the "bd-aio" option and switches AIO on or off to
+ * match. Returns 0 on success and -1 when the option can not be read.
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        bd_priv_t *conf   = this->private;
+        int        status = -1;
+
+        GF_OPTION_RECONF ("bd-aio", conf->aio_configured, options,
+                          bool, done);
+
+        if (!conf->aio_configured) {
+                bd_aio_off (this);
+        } else {
+                bd_aio_on (this);
+        }
+
+        status = 0;
+done:
+        return status;
+}
+
+/**
+ * bd xlator init - Validate configured VG
+ *
+ * Reads the "export" (volume group) and "device" options, creates the
+ * pool for bd_local_t, opens an LVM handle, scans the VG and, when
+ * "bd-aio" is enabled, turns AIO on. Returns 0 on success and -1 on any
+ * failure, in which case everything set up so far is released again.
+ */
+int
+init (xlator_t *this)
+{
+        char      *vg_data = NULL;
+        char      *device  = NULL;
+        bd_priv_t *priv    = NULL;
+
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: storage/bd needs posix as subvolume");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Volume is dangling. Please check the volume file.");
+        }
+
+        GF_OPTION_INIT ("export", vg_data, str, error);
+        GF_OPTION_INIT ("device", device, str, error);
+
+        /* "export" has no default value, so it may be missing entirely */
+        if (!vg_data) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: option 'export' (volume group) is not set");
+                goto error;
+        }
+
+        /* Now we support only LV device */
+        if (strcasecmp (device, BACKEND_VG)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: unknown %s backend %s", BD_XLATOR, device);
+                return -1;
+        }
+
+        this->local_pool = mem_pool_new (bd_local_t, 64);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: Failed to create bd memory pool");
+                return -1;
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_bd_private);
+        if (!priv)
+                goto error;
+
+        this->private = priv;
+        priv->vg = gf_strdup (vg_data);
+        if (!priv->vg)
+                goto error;
+
+        priv->handle = lvm_init (NULL);
+        if (!priv->handle) {
+                gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed");
+                goto error;
+        }
+        priv->caps = BD_CAPS_BD;
+        if (bd_scan_vg (this, priv))
+                goto error;
+
+        priv->aio_init_done = _gf_false;
+        priv->aio_capable = _gf_false;
+
+        GF_OPTION_INIT ("bd-aio", priv->aio_configured, bool, error);
+        if (priv->aio_configured) {
+                if (bd_aio_on (this)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "BD AIO init failed");
+                        goto error;
+                }
+        }
+
+        priv->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT;
+
+        return 0;
+error:
+        /* this label is also reached before priv has been allocated */
+        if (priv) {
+                GF_FREE (priv->vg);
+                if (priv->handle)
+                        lvm_quit (priv->handle);
+                GF_FREE (priv);
+        }
+        if (this->local_pool) {
+                mem_pool_destroy (this->local_pool);
+                this->local_pool = NULL;
+        }
+        /* do not leave a pointer to the freed private data behind */
+        this->private = NULL;
+        return -1;
+}
+
+/*
+ * fini: tears down what init created - the local pool and, when private
+ * data exists, the LVM handle, the VG name and the private structure.
+ */
+void
+fini (xlator_t *this)
+{
+        bd_priv_t *conf = this->private;
+
+        mem_pool_destroy (this->local_pool);
+        this->local_pool = NULL;
+
+        if (conf) {
+                lvm_quit (conf->handle);
+                GF_FREE (conf->vg);
+                this->private = NULL;
+                GF_FREE (conf);
+        }
+}
+
+/* Dump hooks of the translator; both entries are stubs that return 0. */
+struct xlator_dumpops dumpops = {
+ .priv = bd_priv,
+ .inode = bd_inode,
+};
+
+/* File operation entry points implemented by this translator. */
+struct xlator_fops fops = {
+        /* name space and attributes */
+        .lookup       = bd_lookup,
+        .readdirp     = bd_readdirp,
+        .stat         = bd_stat,
+        .fstat        = bd_fstat,
+        .statfs       = bd_statfs,
+        .setattr      = bd_setattr,
+        .link         = bd_link,
+        .unlink       = bd_unlink,
+
+        /* data path */
+        .open         = bd_open,
+        .readv        = bd_readv,
+        .writev       = bd_writev,
+        .flush        = bd_flush,
+        .fsync        = bd_fsync,
+        .truncate     = bd_truncate,
+        .ftruncate    = bd_ftruncate,
+        .discard      = bd_discard,
+        .rchecksum    = bd_rchecksum,
+
+        /* extended attributes */
+        .getxattr     = bd_getxattr,
+        .fgetxattr    = bd_fgetxattr,
+        .setxattr     = bd_setxattr,
+        .fsetxattr    = bd_fsetxattr,
+        .removexattr  = bd_removexattr,
+        .fremovexattr = bd_fremovexattr,
+};
+
+/* Callback table: release and forget handlers of the translator. */
+struct xlator_cbks cbks = {
+ .release = bd_release,
+ .forget = bd_forget,
+};
+
+/*
+ * Options understood by the translator. The table is terminated by the
+ * entry whose key is NULL.
+ */
+struct volume_options options[] = {
+ /* volume group name; init copies it into priv->vg */
+ { .key = {"export"},
+ .type = GF_OPTION_TYPE_STR},
+ /* backend type; init accepts only BACKEND_VG (compared ignoring case) */
+ { .key = {"device"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = BACKEND_VG},
+ /* read by init and reconfigure into priv->aio_configured */
+ {
+ .key = {"bd-aio"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for native Linux AIO"
+ },
+
+ { .key = {NULL} }
+};
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
new file mode 100644
index 000000000..34b4c9e22
--- /dev/null
+++ b/xlators/storage/bd/src/bd.h
@@ -0,0 +1,178 @@
+/*
+ BD translator - Exports Block devices on server side as regular
+ files to client
+
+ Copyright IBM, Corp. 2012
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BD_H
+#define _BD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "xlator.h"
+#include "mem-types.h"
+
+/* name used in log messages, and the only accepted "device" value */
+#define BD_XLATOR "block device mapper xlator"
+#define BACKEND_VG "vg"
+
+/* xattr that maps a posix file to a block device */
+#define GF_XATTR "user.glusterfs"
+#define BD_XATTR GF_XATTR ".bd"
+
+#define BD_LV "lv"
+#define BD_THIN "thin"
+
+/* LVM command paths */
+#define LVM_RESIZE "/sbin/lvresize"
+#define LVM_CREATE "/sbin/lvcreate"
+#define LVM_CONVERT "/sbin/lvconvert"
+
+/* virtual xattrs answered by [f]getxattr */
+#define VOL_TYPE "volume.type"
+#define VOL_CAPS "volume.caps"
+
+#define ALIGN_SIZE 4096
+
+/* capability bits kept in bd_priv_t.caps and reported through VOL_CAPS */
+#define BD_CAPS_BD 0x01
+#define BD_CAPS_THIN 0x02
+#define BD_CAPS_OFFLOAD_COPY 0x04
+#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08
+
+/* [f]setxattr keys that trigger offload operations, and the getxattr
+ * key handled through bd_get_origin */
+#define BD_CLONE "clone"
+#define BD_SNAPSHOT "snapshot"
+#define BD_MERGE "merge"
+#define BD_ORIGIN "list-origin"
+
+#define IOV_NR 4
+#define IOV_SIZE (64 * 1024)
+
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
+#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \
+ if (!buff) { \
+ op_errno = ENOMEM; \
+ gf_log (this->name, GF_LOG_ERROR, "out of memory"); \
+ goto label; \
+ }
+
+#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \
+ if (!local) { \
+ op_errno = EINVAL; \
+ goto label; \
+ }
+
+/*
+ * BD_STACK_UNWIND: unwind @frame as fop @typ with @args and release the
+ * bd_local_t attached to it. frame->local and frame->this are read and
+ * frame->local is cleared before the unwind; bd_local_free is called
+ * afterwards when a local was attached.
+ */
+#define BD_STACK_UNWIND(typ, frame, args ...) do { \
+ bd_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ bd_local_free (__this, __local); \
+ } while (0)
+
+/* buffer of GF_UUID_BUF_SIZE chars holding a gfid in its string form */
+typedef char bd_gfid_t[GF_UUID_BUF_SIZE];
+
+/*
+ * Memory accounting types of the bd translator, numbered after the common
+ * types. gf_bd_mt_end marks the end of the list.
+ */
+enum gf_bd_mem_types_ {
+ gf_bd_private = gf_common_mt_end + 1,
+ gf_bd_attr,
+ gf_bd_fd,
+ gf_bd_mt_end
+};
+
+/**
+ * bd_fd - internal per-fd structure, obtained with bd_fd_ctx_get
+ */
+typedef struct bd_fd {
+ int fd; /* descriptor used for pread/pwritev on the block device */
+ int32_t flag; /* NOTE(review): presumably the open flags - confirm */
+ int odirect; /* NOTE(review): presumably set for O_DIRECT opens - confirm */
+} bd_fd_t;
+
+/* Private data of the translator, allocated in init and freed in fini. */
+typedef struct bd_priv {
+ lvm_t handle; /* LVM handle returned by lvm_init */
+ char *vg; /* copy of the "export" option */
+ char *pool; /* NOTE(review): presumably the thin pool name - confirm */
+ int caps; /* bit set of BD_CAPS_* */
+ gf_boolean_t aio_init_done;
+ gf_boolean_t aio_capable;
+ gf_boolean_t aio_configured; /* value of the "bd-aio" option */
+#ifdef HAVE_LIBAIO
+ io_context_t ctxp;
+ pthread_t aiothread;
+#endif
+} bd_priv_t;
+
+
+/* kind of block device backing a file */
+typedef enum bd_type {
+ BD_TYPE_NONE,
+ BD_TYPE_LV,
+} bd_type_t;
+
+/* Per-inode context of a BD file, accessed with bd_inode_ctx_get/set. */
+typedef struct {
+ struct iatt iatt; /* cached attributes; ia_size is the LV size */
+ char *type; /* type part of the BD_XATTR value ("<type>:<size>") */
+} bd_attr_t;
+
+/*
+ * Operation requested through [f]setxattr. BD_OF_NONE is 0, so the value
+ * can be tested as a boolean.
+ */
+typedef enum {
+ BD_OF_NONE,
+ BD_OF_CLONE,
+ BD_OF_SNAPSHOT,
+ BD_OF_MERGE,
+} bd_offload_t;
+
+/*
+ * Per-frame state of a bd fop, created with bd_local_init and released
+ * with bd_local_free (see BD_STACK_UNWIND).
+ */
+typedef struct {
+ dict_t *dict; /* xattr dict sent to the child */
+ bd_attr_t *bdatt; /* truncate: carries the requested size in iatt.ia_size */
+ inode_t *inode; /* referenced inode the fop works on */
+ loc_t loc; /* copy of the caller's loc for path based fops */
+ fd_t *fd; /* referenced fd for fd based fops, else NULL */
+ data_t *data; /* for setxattr */
+ bd_offload_t offload; /* requested offload operation */
+ uint64_t size; /* size parsed from the offload xattr value */
+ loc_t *dloc; /* offload destination, identified by gfid */
+} bd_local_t;
+
+/* Prototypes */
+
+/* inode, fd and per-frame context */
+int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx);
+int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx);
+int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd);
+bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this);
+void bd_local_free (xlator_t *this, bd_local_t *local);
+
+/* helpers */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv);
+char *page_aligned_alloc (size_t size, char **aligned_buf);
+int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                          uint64_t *lv_size, uuid_t uuid);
+uint64_t bd_get_default_extent (bd_priv_t *priv);
+uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size);
+/* declared without "inline": the definition is not part of this header */
+void bd_update_amtime (struct iatt *iatt, int flag);
+
+/* LV operations */
+int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv);
+int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size);
+int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno);
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+
+#endif
diff --git a/xlators/storage/bd_map/src/bd_map.c b/xlators/storage/bd_map/src/bd_map.c
deleted file mode 100644
index 9c8f69c64..000000000
--- a/xlators/storage/bd_map/src/bd_map.c
+++ /dev/null
@@ -1,2580 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Now only exporting Logical volumes supported.
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <time.h>
-#include <lvm2app.h>
-#include <openssl/md5.h>
-
-#include "bd_map.h"
-#include "bd_map_help.h"
-#include "defaults.h"
-#include "glusterfs3-xdr.h"
-#include "run.h"
-#include "protocol-common.h"
-
-/* Regular fops */
-
-int
-bd_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char path[PATH_MAX] = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- sprintf (path, "/dev/mapper/%s", loc->path);
- op_ret = access (path, mask & 07);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL);
-
- return 0;
-}
-
-#define LV_RENAME "/sbin/lvrename"
-
-int bd_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *new_path = NULL;
- char *np = NULL;
- struct iatt stbuf = {0, };
- struct iatt preoldparent = {0, };
- struct iatt postoldparent = {0, };
- struct iatt prenewparent = {0, };
- struct iatt postnewparent = {0, };
- bd_priv_t *priv = NULL;
- bd_entry_t *lventry = NULL;
- bd_entry_t *newp_entry = NULL;
- char *path = NULL;
- struct stat v_stat = {0, };
- runner_t runner = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (oldloc, out);
- VALIDATE_OR_GOTO (newloc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, oldloc->path);
- if (lventry->refcnt > 1) {
- op_errno = EBUSY;
- goto out;
- }
-
- memcpy (&preoldparent, lventry->parent->attr, sizeof(preoldparent));
-
- new_path = np = gf_strdup (newloc->path);
- if (!new_path)
- goto out;
- new_path = strrchr (np, '/');
- if (!new_path) {
- op_errno = EINVAL;
- goto out;
- }
-
- *new_path = '\0';
- BD_ENTRY (priv, newp_entry, np);
-
- memcpy (&prenewparent, newp_entry->parent->attr, sizeof(preoldparent));
-
- runinit (&runner);
-
- runner_add_args (&runner, LV_RENAME, NULL);
- runner_add_args (&runner, lventry->parent->name, NULL);
- runner_add_args (&runner, oldloc->name, NULL);
- runner_add_args (&runner, newloc->name, NULL);
-
- runner_start (&runner);
- runner_end (&runner);
-
- /* verify */
- gf_asprintf (&path, "/dev/%s", newloc->path);
- if (stat (path, &v_stat) < 0) {
- op_errno = EIO;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry);
- BD_ENTRY_UPDATE_MTIME (newp_entry);
- memcpy (&postoldparent, lventry->parent->attr, sizeof(postoldparent));
- memcpy (&postnewparent, newp_entry->parent->attr,
- sizeof(postoldparent));
- BD_WR_LOCK (&priv->lock);
- strncpy (lventry->name, newloc->name, sizeof(lventry->name));
- memcpy (&stbuf, lventry->attr, sizeof(stbuf));
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (newp_entry)
- BD_PUT_ENTRY (priv, newp_entry);
- if (np)
- GF_FREE (np);
- if (path)
- GF_FREE (path);
-
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf,
- &preoldparent, &postoldparent, &prenewparent,
- &postnewparent, NULL);
- return 0;
-}
-
-int32_t
-bd_delete_lv (bd_priv_t *priv, bd_entry_t *p_entry, bd_entry_t *lventry,
- const char *path, int *op_errno)
-{
- vg_t vg = NULL;
- lv_t lv = NULL;
- int op_ret = -1;
-
- *op_errno = 0;
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, p_entry->name, "w", 0);
- if (!vg) {
- *op_errno = ENOENT;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- lv = lvm_lv_from_name (vg, lventry->name);
- if (!lv) {
- lvm_vg_close (vg);
- *op_errno = ENOENT;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- op_ret = lvm_vg_remove_lv (lv);
- if (op_ret < 0) {
- *op_errno = errno;
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- lvm_vg_close (vg);
-
- op_ret = bd_entry_rm (path);
- if (op_ret < 0) {
- *op_errno = EIO;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (p_entry);
-
- op_ret = 0;
- op_errno = 0;
-
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-out:
- return op_ret;
-}
-
-int32_t
-bd_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = ENOENT;
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_priv_t *priv = NULL;
- bd_entry_t *lventry = NULL;
- bd_entry_t *p_entry = NULL;
- char *vg_name = NULL;
- char *volume = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- volume = vg_name = gf_strdup (loc->path);
- if (!volume)
- goto out;
- volume = strrchr (volume, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- *volume = '\0';
-
- BD_ENTRY (priv, p_entry, vg_name);
- BD_ENTRY (priv, lventry, loc->path);
- if (!p_entry || !lventry)
- goto out;
-
- memcpy (&preparent, p_entry->attr, sizeof(preparent));
- op_ret = bd_delete_lv (priv, p_entry, lventry, loc->path, &op_errno);
- memcpy (&postparent, p_entry->attr, sizeof(postparent));
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (vg_name)
- GF_FREE (vg_name);
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- &preparent, &postparent, NULL);
-
- return 0;
-}
-
-#define LVM_CREATE "/sbin/lvcreate"
-
-#define IOV_NR 4
-#define IOV_SIZE (4 * 1024)
-
-int bd_clone_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output,
- const char *vg_name, const char *lv_name,
- const char *dest_lv_name, struct iatt *stbuf)
-{
- int32_t ret = -1;
- vg_t vg = NULL;
- lv_t lv = NULL;
- ssize_t size = 0;
- uint64_t extent = 0;
- int fd1 = -1;
- int fd2 = -1;
- struct iatt iattr = {0, };
- bd_entry_t *lventry = NULL;
- char path[512] = {0, };
- struct iovec *vec = NULL;
- int i = 0;
- ssize_t bytes = 0;
- int nr_iov = 0;
-
- vec = GF_CALLOC (IOV_NR, sizeof(struct iovec), gf_common_mt_iovec);
- if (!vec)
- goto out;
-
- for (i = 0; i < IOV_NR; i++) {
- vec[i].iov_base = GF_MALLOC (IOV_SIZE, gf_common_mt_char);
- if (!vec[i].iov_base)
- goto out;
- vec[i].iov_len = IOV_SIZE;
- }
-
- vg = lvm_vg_open (priv->handle, vg_name, "w", 0);
- if (!vg) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "lvm_vg_open %s failed", vg_name);
- ret = -1;
- goto out;
- }
- lv = lvm_lv_from_name (vg, lv_name);
- if (!lv) {
- gf_log (THIS->name, GF_LOG_ERROR, "lvm_lv_from_name failed");
- ret = -1;
- goto out;
- }
-
- size = lvm_lv_get_size (lv);
- extent = size / lvm_vg_get_extent_size (vg);
-
- if (lvm_vg_create_lv_linear (vg, dest_lv_name, size) == NULL) {
- gf_log (THIS->name, GF_LOG_ERROR, "lv_create:%s",
- lvm_errmsg(priv->handle));
- ret = -1;
- goto out;
- }
- sprintf (path, "/dev/%s/%s", vg_name, lv_name);
- fd1 = open (path, O_RDONLY);
- if (fd1 < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path);
- goto out;
- }
- sprintf (path, "/dev/%s/%s", vg_name, dest_lv_name);
- fd2 = open (path, O_WRONLY);
- if (fd2 < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path);
- goto out;
- }
-
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = size;
-
- bytes = size;
- while (bytes) {
- size = readv(fd1, vec, IOV_NR);
- if (size < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "read failed:%s", strerror(errno));
- goto out;
- }
- if (size < IOV_NR * IOV_SIZE) {
- vec[size / IOV_SIZE].iov_len = size % IOV_SIZE;
- nr_iov = (size / IOV_SIZE) + 1;
- } else
- nr_iov = IOV_NR;
- bytes -= size;
- size = writev (fd2, vec, nr_iov);
- if (size < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "write failed:%s", strerror(errno));
- goto out;
- }
- }
-
- lventry = bd_entry_add (p_entry, dest_lv_name, &iattr, IA_IFREG);
- if (!lventry) {
- ret = EAGAIN;
- goto out;
- }
-
- if (stbuf)
- memcpy (stbuf, &iattr, sizeof(iattr));
-
- ret = 0;
- gf_log (THIS->name, GF_LOG_INFO, "Clone completed");
-out:
- if (vg)
- lvm_vg_close (vg);
- if (fd1 != -1)
- close (fd1);
- if (fd2 != -1)
- close (fd2);
- if (vec)
- iov_free (vec, IOV_NR);
- return ret;
-}
-
-int bd_snapshot_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output,
- const char *lv_name, const char *dest_lv, char *size,
- struct iatt *stbuf)
-{
- int32_t ret = -1;
- struct iatt iattr = {0, };
- struct stat stat = {0, };
- bd_entry_t *lventry = NULL;
- char *error = NULL;
- int retval = -1;
- runner_t runner = {0, };
- char *path = NULL;
- vg_t vg = NULL;
- lv_t lv = NULL;
-
- runinit (&runner);
-
- runner_add_args (&runner, LVM_CREATE, NULL);
- runner_add_args (&runner, "--snapshot", NULL);
- runner_argprintf (&runner, "/dev/%s/%s", p_entry->name, lv_name);
- runner_add_args (&runner, "--name", NULL);
- runner_argprintf (&runner, "%s", dest_lv);
- runner_argprintf (&runner, "-L%s", size);
-
- runner_start (&runner);
- runner_end (&runner);
-
- gf_asprintf (&path, "/dev/%s/%s", p_entry->name, dest_lv);
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- if (lstat (path, &stat) < 0) {
- ret = -EAGAIN;
- if (output)
- gf_asprintf (&error, "try again");
- goto out;
- }
-
- vg = lvm_vg_open (priv->handle, p_entry->name, "r", 0);
- if (!vg) {
- ret = -EIO;
- if (output)
- gf_asprintf (&error, "can't open vg %s", p_entry->name);
- goto out;
- }
- lv = lvm_lv_from_name (vg, lv_name);
- if (!lv) {
- ret = -EIO;
- if (output)
- gf_asprintf (&error, "can't open lv %s", lv_name);
- goto out;
- }
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = lvm_lv_get_size (lv);
- lventry = bd_entry_add (p_entry, dest_lv, &iattr, IA_IFREG);
- if (!lventry) {
- if (output)
- gf_asprintf (&error, "try again");
- ret = -EAGAIN;
- goto out;
- }
- if (stbuf)
- memcpy (stbuf, &iattr, sizeof(iattr));
- ret = 0;
-out:
- if (vg)
- lvm_vg_close (vg);
- if (error && output)
- retval = dict_set_str (output, "error", error);
- GF_FREE (path);
- return ret;
-}
-
-/*
- * Creates a snapshot of given LV
- */
-int
-bd_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *lventry = NULL;
- char *name = NULL;
- char *np = NULL;
- char *volume = NULL;
- char *vg_name = NULL;
- char *path = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- if (strchr (loc->path, '/')) {
- vg_name = gf_strdup (loc->path);
- volume = strrchr (vg_name, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- GF_FREE (vg_name);
- vg_name = NULL;
- }
-
- /*
- * symlink creation for BD xlator is different
- * source (LV) has to exist for creation of symbolic link (snapshot)
- */
- if (strchr (linkname, '/')) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- gf_asprintf (&path, "%s/%s", priv->vg, linkname);
- if (!path) {
- op_errno = -ENOMEM;
- goto out;
- }
- BD_ENTRY (priv, lventry, path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
-
- name = np = gf_strdup (loc->path);
- if (!name)
- goto out;
-
- /* Get LV name from loc->path */
- name = strrchr (loc->path, '/');
- if (name != loc->path)
- name++;
-
- memcpy (&preparent, lventry->parent->attr, sizeof(preparent));
- if (bd_snapshot_lv (priv, lventry->parent, NULL, lventry->name,
- name, "1", &stbuf) < 0) {
- op_errno = EAGAIN;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry->parent);
- memcpy (&postparent, lventry->parent->attr, sizeof (postparent));
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (np)
- GF_FREE (np);
- if (vg_name)
- GF_FREE (vg_name);
- if (path)
- GF_FREE (path);
-
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-/*
- * bd_link: Does full clone of given logical volume
- * A new logical volume with source logical volume's size created
- * and entire content copied
- */
-int
-bd_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *lventry = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (oldloc, out);
- VALIDATE_OR_GOTO (newloc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, oldloc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
- memcpy (&postparent, lventry->parent->attr, sizeof (postparent));
- if (bd_clone_lv (priv, lventry->parent, NULL, lventry->parent->name,
- lventry->name, newloc->name, &stbuf) < 0) {
- op_errno = EAGAIN;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry->parent);
- memcpy (&preparent, lventry->parent->attr, sizeof (preparent));
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
-
-
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno,
- (oldloc)?oldloc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int32_t
-bd_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- bd_fd_t *bd_fd = NULL;
- bd_entry_t *lventry = NULL;
- bd_priv_t *priv = NULL;
- char *devpath = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
-
- gf_asprintf (&devpath, "/dev/%s/%s", lventry->parent->name,
- lventry->name);
- _fd = open (devpath, flags, 0);
- if (_fd == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "open on %s: %s", devpath, strerror (op_errno));
- goto out;
- }
-
- bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd);
- if (!bd_fd) {
- op_errno = errno;
- goto out;
- }
- bd_fd->entry = lventry;
- bd_fd->fd = _fd;
-
- op_ret = fd_ctx_set (fd, this, (uint64_t)(long)bd_fd);
- if (op_ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set the fd context path=%s fd=%p",
- loc->name, fd);
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (_fd != -1)
- close (_fd);
- /* FIXME: Should we call fd_ctx_set with NULL? */
- if (bd_fd)
- GF_FREE (bd_fd);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- }
- if (devpath)
- GF_FREE (devpath);
-
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
-
- return 0;
-}
-
-int
-bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- uint64_t tmp_bd_fd = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- bd_priv_t *priv = NULL;
- struct iobuf *iobuf = NULL;
- struct iobref *iobref = NULL;
- struct iovec vec = {0, };
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- struct iatt stbuf = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL from fd=%p", fd);
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- if (!size) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
- goto out;
- }
- iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
- if (!iobuf) {
- op_errno = ENOMEM;
- goto out;
- }
- _fd = bd_fd->fd;
- op_ret = pread (_fd, iobuf->ptr, size, offset);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "read failed on fd=%p: %s", fd,
- strerror (op_errno));
- goto out;
- }
-
- vec.iov_base = iobuf->ptr;
- vec.iov_len = op_ret;
-
- iobref = iobref_new ();
- iobref_add (iobref, iobuf);
- BD_ENTRY_UPDATE_ATIME (bd_fd->entry);
-
- memcpy (&stbuf, bd_fd->entry->attr, sizeof(stbuf));
-
- /* Hack to notify higher layers of EOF. */
- if (bd_fd->entry->size == 0)
- op_errno = ENOENT;
- else if ((offset + vec.iov_len) >= bd_fd->entry->size)
- op_errno = ENOENT;
- op_ret = vec.iov_len;
-out:
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
- &vec, 1, &stbuf, iobref, NULL);
-
- if (iobref)
- iobref_unref (iobref);
- if (iobuf)
- iobuf_unref (iobuf);
- return 0;
-}
-
-#define LVM_RESIZE "/sbin/lvresize"
-
-int32_t
-bd_resize (bd_priv_t *priv, bd_entry_t *lventry, off_t *size)
-{
- bd_entry_t *vgentry = NULL;
- uint64_t extent = 0;
- int32_t op_ret = -1;
- vg_t vg = NULL;
- uint32_t nr_ex = 0;
- lv_t lv = NULL;
- uint64_t new_size = 0;
- runner_t runner = {0, };
-
- BD_ENTRY (priv, vgentry, lventry->parent->name);
- if (!vgentry) {
- op_ret = ENOENT;
- goto out;
- }
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0);
- if (!vg) {
- op_ret = lvm_errno (priv->handle);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- extent = lvm_vg_get_extent_size (vg);
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
-
- nr_ex = *size / extent;
- if (*size % extent)
- nr_ex++;
- *size = extent * nr_ex;
-
- runinit (&runner);
-
- runner_add_args (&runner, LVM_RESIZE, NULL);
- runner_argprintf (&runner, "/dev/%s/%s", lventry->parent->name,
- lventry->name);
- runner_argprintf (&runner, "-l%ld", nr_ex);
- runner_add_args (&runner, "-f", NULL);
-
- runner_start (&runner);
- runner_end (&runner);
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0);
- if (!vg) {
- op_ret = lvm_errno (priv->handle);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- lv = lvm_lv_from_name (vg, lventry->name);
- if (!lv) {
- op_ret = lvm_errno (priv->handle);
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- new_size = lvm_lv_get_size (lv);
- lvm_vg_close (vg);
- if (new_size != *size) {
- op_ret = EIO;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-
-out:
- if (vgentry)
- BD_PUT_ENTRY (priv, vgentry);
-
- return op_ret;
-}
-
- int32_t
-bd_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
-
- memcpy (&preop, bd_fd->entry->attr, sizeof(preop));
- if (offset > bd_fd->entry->size) {
- op_errno = bd_resize (priv, bd_fd->entry, &offset);
- if (op_errno)
- goto out;
- if (offset > bd_fd->entry->size) {
- bd_fd->entry->attr->ia_size = offset;
- bd_fd->entry->size = offset;
- }
- }
- /* If the requested size is less then current size
- * we will not update that in bd_fd->entry->attr
- * because it will result in showing size of this file less
- * instead we will return 0 for less size truncation
- */
- BD_ENTRY_UPDATE_MTIME (bd_fd->entry);
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
-
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop,
- &postop, NULL);
- return 0;
-}
-
-int32_t
-bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct iatt prebuf = {0, };
- struct iatt postbuf = {0, };
- bd_entry_t *lventry = NULL;
- bd_priv_t *priv = NULL;
- off_t size = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR,
- "pre-operation lstat on %s failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&prebuf, lventry->attr, sizeof(prebuf));
- if (offset > lventry->size) {
- op_errno = bd_resize (priv, lventry, &size);
- if (op_errno)
- goto out;
- if (lventry->size < offset) {
- lventry->attr->ia_size = offset;
- lventry->size = size;
- }
- }
- BD_ENTRY_UPDATE_MTIME (lventry);
- memcpy (&postbuf, lventry->attr, sizeof(postbuf));
- BD_PUT_ENTRY (priv, lventry);
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
- &prebuf, &postbuf, NULL);
- return 0;
-}
-
-int32_t
-__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
- uint64_t bd_size)
-{
- int32_t op_ret = 0;
- int index = 0;
- int retval = 0;
- off_t internal_offset = 0;
- int no_space = 0;
-
- if (!vector)
- return -EFAULT;
-
- internal_offset = offset;
- for (index = 0; index < count; index++) {
- if (internal_offset >= bd_size) {
- op_ret = -ENOSPC;
- goto err;
- }
- if (internal_offset + vector[index].iov_len >= bd_size) {
- vector[index].iov_len = bd_size - internal_offset;
- no_space = 1;
- }
-
- retval = pwrite (fd, vector[index].iov_base,
- vector[index].iov_len, internal_offset);
- if (retval == -1) {
- gf_log (THIS->name, GF_LOG_WARNING,
- "base %p, length %ld, offset %ld, message %s",
- vector[index].iov_base, vector[index].iov_len,
- internal_offset, strerror (errno));
- op_ret = -errno;
- goto err;
- }
- op_ret += retval;
- internal_offset += retval;
- if (no_space)
- break;
- }
-err:
- return op_ret;
-}
-
-int bd_create_lv (bd_priv_t *priv, bd_entry_t *p_entry, const char *vg_name,
- const char *lv_name, char *size, mode_t mode)
-{
- vg_t vg = NULL;
- int ret = -1;
- char *path = NULL;
- struct iatt iattr = {0, };
- bd_entry_t *lventry = NULL;
- uint64_t extent = 0;
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vg_name, "w", 0);
- if (!vg) {
- ret = -1;
- goto out;
- }
- extent = lvm_vg_get_extent_size (vg);
- if (size)
- gf_string2bytesize (size, &extent);
-
- if (lvm_vg_create_lv_linear (vg, lv_name, extent) == NULL) {
- ret = -EAGAIN;
- lvm_vg_close (vg);
- goto out;
- }
- lvm_vg_close (vg);
-
- gf_asprintf (&path, "/dev/%s/%s", vg_name, lv_name);
- if (!path) {
- ret = -ENOMEM;
- lvm_vg_close (vg);
- goto out;
- }
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = extent;
- if (!mode)
- mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
-
- iattr.ia_type = ia_type_from_st_mode (mode);
- iattr.ia_prot = ia_prot_from_st_mode (mode);
- lventry = bd_entry_add (p_entry, lv_name, &iattr, IA_IFREG);
- if (!lventry) {
- ret = -EAGAIN;
- goto out;
- }
- ret = 0;
-out:
- BD_UNLOCK (&priv->lock);
- if (path)
- GF_FREE (path);
- return ret;
-}
-
-int bd_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- bd_fd_t *pfd = NULL;
- char *vg_name = NULL;
- char *volume = NULL;
- char *path = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- volume = vg_name = gf_strdup (loc->path);
- if (!volume)
- goto out;
- volume = strrchr (volume, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- *volume = '\0';
-
- BD_ENTRY (priv, p_entry, vg_name);
- if (!p_entry) {
- op_errno = ENOENT;
- goto out;
- }
-
- memcpy (&preparent, p_entry->attr, sizeof(preparent));
-
- op_errno = bd_create_lv (priv, p_entry, p_entry->name, loc->name, 0,
- mode);
- if (op_errno)
- goto out;
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- gf_log (this->name, GF_LOG_WARNING,
- "newly created LV not available %s", loc->path);
- op_errno = EAGAIN;
- goto out;
- }
-
- /* Mask O_CREATE since we created LV */
- flags &= ~(O_CREAT | O_EXCL);
-
- gf_asprintf (&path, "/dev/%s/%s", p_entry->name, loc->name);
- if (!path) {
- op_errno = ENOMEM;
- goto out;
- }
- _fd = open (path, flags, 0);
- if (_fd == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "open on %s: %s", path, strerror (op_errno));
- goto out;
- }
-
- memcpy (&stbuf, lventry->attr, sizeof(stbuf));
-
- pfd = GF_CALLOC (1, sizeof(*pfd), gf_bd_fd);
- if (!pfd) {
- op_errno = errno;
- goto out;
- }
- pfd->flag = flags;
- pfd->fd = _fd;
- pfd->entry = lventry;
-
- if (fd_ctx_set (fd, this, (uint64_t)(long)pfd)) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set the fd context path=%s fd=%p",
- loc->name, fd);
- goto out;
- }
-
- op_ret = 0;
-
- memcpy (&postparent, p_entry->attr, sizeof(postparent));
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (path)
- GF_FREE (path);
- if (op_ret < 0 && lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (vg_name)
- GF_FREE (vg_name);
-
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-/*
- * We don't do actual setattr on devices on the host side, we just update
- * the entries in server process & they are not persistent
- */
-int bd_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- struct iatt statpre = {0, };
- struct iatt statpost = {0, };
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- bd_fd_t *pfd = NULL;
- int ret = 0;
- uint64_t tmp_pfd = 0;
- int _fd = -1;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp_pfd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "pfd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- pfd = (bd_fd_t *)(long)tmp_pfd;
-
- _fd = pfd->fd;
- memcpy (&statpre, pfd->entry->attr, sizeof(statpre));
- op_ret = 0;
-
- if (valid & GF_SET_ATTR_MODE)
- pfd->entry->attr->ia_prot = stbuf->ia_prot;
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- if (valid & GF_SET_ATTR_UID)
- pfd->entry->attr->ia_uid = stbuf->ia_uid;
- if (valid & GF_SET_ATTR_GID)
- pfd->entry->attr->ia_gid = stbuf->ia_gid;
- }
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- pfd->entry->attr->ia_atime = stbuf->ia_atime;
- pfd->entry->attr->ia_atime_nsec = stbuf->ia_atime_nsec;
- pfd->entry->attr->ia_mtime = stbuf->ia_mtime;
- pfd->entry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec;
- }
- memcpy (&statpost, pfd->entry->attr, sizeof(statpost));
- op_errno = 0;
-out:
- STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL);
- return 0;
-}
-
-int bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- struct iatt statpre = {0, };
- struct iatt statpost = {0, };
- bd_entry_t *lventry = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- char path[PATH_MAX] = {0, };
-
- priv = this->private;
-
- /*
- * We don't allow to do setattr on / on host side
- * ie /dev
- */
- if (!strcmp (loc->path, "/")) {
- op_ret = 0;
- goto out;
- }
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
- sprintf (path, "/dev/%s/%s", lventry->parent->name, lventry->name);
-
- memcpy (&statpre, lventry->attr, sizeof(statpre));
- if (valid & GF_SET_ATTR_MODE)
- lventry->attr->ia_prot = stbuf->ia_prot;
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- if (valid & GF_SET_ATTR_UID)
- lventry->attr->ia_uid = stbuf->ia_uid;
- if (valid & GF_SET_ATTR_GID)
- lventry->attr->ia_gid = stbuf->ia_gid;
- }
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- lventry->attr->ia_atime = stbuf->ia_atime;
- lventry->attr->ia_atime_nsec = stbuf->ia_atime_nsec;
- lventry->attr->ia_mtime = stbuf->ia_mtime;
- lventry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec;
- }
- memcpy (&statpost, lventry->attr, sizeof(statpost));
- op_errno = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL);
- return 0;
-}
-
-int
-bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- uint32_t flags, struct iobref *iobref, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- bd_priv_t *priv = NULL;
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
- uint64_t tmp_bd_fd = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (vector, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -ret;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL from fd=%p", fd);
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- _fd = bd_fd->fd;
-
- memcpy (&preop, bd_fd->entry->attr, sizeof(preop));
- op_ret = __bd_pwritev (_fd, vector, count, offset, bd_fd->entry->size);
- if (op_ret < 0) {
- op_errno = -op_ret;
- op_ret = -1;
- gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
- ", %s", offset, strerror (op_errno));
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (bd_fd->entry);
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
-
-out:
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop,
- &postop, NULL);
-
- return 0;
-}
-
-int32_t
-bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
-{
- struct iatt buf = {0, };
- int32_t op_ret = -1;
- int32_t entry_ret = 0;
- int32_t op_errno = 0;
- char *pathdup = NULL;
- bd_entry_t *bdentry = NULL;
- struct iatt postparent = {0, };
- bd_priv_t *priv = NULL;
- char *p = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- entry_ret = -1;
- goto parent;
- }
- memcpy (&buf, bdentry->attr, sizeof(buf));
- BD_PUT_ENTRY (priv, bdentry);
-
-parent:
- if (loc->parent) {
- pathdup = p = gf_strdup (loc->path);
- if (!pathdup) {
- op_errno = ENOMEM;
- entry_ret = -1;
- goto out;
- }
- p = strrchr (pathdup, '/');
- if (p == pathdup)
- *(p+1) = '\0';
- else
- *p = '\0';
- BD_ENTRY (priv, bdentry, pathdup);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR,
- "post-operation lookup on parent of %s "
- "failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&postparent, bdentry->attr, sizeof(postparent));
- BD_PUT_ENTRY (priv, bdentry);
- }
-
- op_ret = entry_ret;
-out:
- if (pathdup)
- GF_FREE (pathdup);
-
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno,
- (loc)?loc->inode:NULL, &buf, NULL, &postparent);
-
- return 0;
-}
-
-int32_t
-bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- struct iatt buf = {0,};
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR, "stat on %s failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&buf, bdentry->attr, sizeof(buf));
- BD_PUT_ENTRY (priv, bdentry);
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL);
-
- return 0;
-}
-
-int32_t
-bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- int ret = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- uint64_t tmp_bd_fd = 0;
- struct iatt buf = {0, };
- bd_fd_t *bd_fd = NULL;
- int _fd = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -EINVAL;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- _fd = bd_fd->fd;
-
- memcpy (&buf, bd_fd->entry->attr, sizeof(buf));
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL);
- return 0;
-}
-
-int32_t
-bd_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- bd_fd_t *bd_fd = NULL;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR, "opendir failed on %s: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd);
- if (!bd_fd) {
- op_errno = errno;
- BD_PUT_ENTRY (priv, bdentry);
- goto out;
- }
-
- bd_fd->p_entry = bdentry;
-
- bdentry = list_entry ((&bdentry->child)->next, typeof(*bdentry), child);
- if (!bdentry) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL");
- goto out;
- }
- bdentry = list_entry ((&bdentry->sibling), typeof(*bdentry), sibling);
- if (!bdentry) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL");
- goto out;
- }
-
- bd_fd->entry = bdentry;
-
- op_ret = fd_ctx_set (fd, this, (uint64_t) (long)bd_fd);
- if (op_ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set the fd context path=%s fd=%p",
- loc->path, fd);
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- BD_PUT_ENTRY (priv, bd_fd->p_entry);
- if (bd_fd)
- GF_FREE (bd_fd);
- }
-
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL);
- return 0;
-}
-
-int32_t
-bd_releasedir (xlator_t *this, fd_t *fd)
-{
- bd_fd_t *bd_fd = NULL;
- uint64_t tmp_bd_fd = 0;
- int ret = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_del (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "bd_fd from fd=%p is NULL",
- fd);
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- BD_PUT_ENTRY (priv, bd_fd->p_entry);
-
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- GF_FREE (bd_fd);
-out:
- return 0;
-}
-
-/*
- * bd_statfs: Mimics statfs by returning used/free extents in the VG
- * TODO: IF more than one VG allowed per volume, this functions needs some
- * change
- */
-int32_t
-bd_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct statvfs buf = {0, };
- vg_t vg = NULL;
- char *vg_name = NULL;
- uint64_t size = 0;
- uint64_t fr_size = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = dict_get_str (this->options, "export", &vg_name);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify volume groups");
- op_errno = EINVAL;
- goto out;
- }
-
- BD_RD_LOCK (&priv->lock);
-
- vg = lvm_vg_open (priv->handle, vg_name, "r", 0);
- size += lvm_vg_get_size (vg);
- fr_size += lvm_vg_get_free_size (vg);
- lvm_vg_close (vg);
-
- BD_UNLOCK (&priv->lock);
-
- if (statvfs ("/", &buf) < 0) {
- op_errno = errno;
- goto out;
- }
- op_ret = 0;
- buf.f_blocks = size / buf.f_frsize;
- buf.f_bfree = fr_size / buf.f_frsize;
- buf.f_bavail = fr_size / buf.f_frsize;
-out:
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL);
- return 0;
-}
-
-int32_t
-bd_release (xlator_t *this, fd_t *fd)
-{
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL from fd=%p",
- fd);
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- close (bd_fd->fd);
- BD_PUT_ENTRY (priv, bd_fd->entry);
-
- GF_FREE (bd_fd);
-out:
- return 0;
-}
-
-int32_t
-bd_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t datasync, dict_t *xdata)
-{
- int _fd = -1;
- int ret = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- uint64_t tmp_bd_fd = 0;
- bd_fd_t *bd_fd = NULL;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
-
- _fd = bd_fd->fd;
- memcpy (&preop, &bd_fd->entry->attr, sizeof(preop));
- if (datasync) {
- ;
-#ifdef HAVE_FDATASYNC
- op_ret = fdatasync (_fd);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "fdatasync on fd=%p failed: %s",
- fd, strerror (errno));
- }
-#endif
- } else {
- op_ret = fsync (_fd);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "fsync on fd=%p failed: %s",
- fd, strerror (op_errno));
- goto out;
- }
- }
-
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop,
- &postop, NULL);
-
- return 0;
-}
-
-int32_t
-bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL on fd=%p", fd);
- goto out;
- }
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL);
-
- return 0;
-}
-
-int
-__bd_fill_readdir (pthread_rwlock_t *bd_lock, bd_fd_t *bd_fd, off_t off,
- size_t size, gf_dirent_t *entries)
-{
- size_t filled = 0;
- int count = 0;
- struct dirent entry = {0, };
- int32_t this_size = -1;
- gf_dirent_t *this_entry = NULL;
- bd_entry_t *bdentry = NULL;
- bd_entry_t *cur_entry = NULL;
- bd_entry_t *n_entry = NULL;
-
- BD_RD_LOCK (bd_lock);
-
- bdentry = list_entry ((&bd_fd->p_entry->child)->next, typeof(*n_entry),
- child);
-
- if (off) {
- int i = 0;
- list_for_each_entry (n_entry, &bd_fd->entry->sibling, sibling) {
- if (i == off && strcmp (n_entry->name, "")) {
- bd_fd->entry = n_entry;
- break;
- }
- }
- } else
- bd_fd->entry = list_entry ((&bdentry->sibling),
- typeof(*n_entry), sibling);
-
- while (filled <= size) {
- cur_entry = bd_fd->entry;
-
- n_entry = list_entry ((&bd_fd->entry->sibling)->next,
- typeof (*cur_entry), sibling);
- if (&n_entry->sibling == (&bdentry->sibling))
- break;
-
- strcpy (entry.d_name, n_entry->name);
- entry.d_ino = n_entry->attr->ia_ino;
- entry.d_off = off;
- if (n_entry->attr->ia_type == IA_IFDIR)
- entry.d_type = DT_DIR;
- else
- entry.d_type = DT_REG;
-
- this_size = max (sizeof(gf_dirent_t),
- sizeof (gfs3_dirplist))
- + strlen (entry.d_name) + 1;
-
- if (this_size + filled > size)
- break;
-
- bd_fd->entry = n_entry;
-
- this_entry = gf_dirent_for_name (entry.d_name);
- if (!this_entry) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "could not create gf_dirent for entry %s",
- entry.d_name);
- goto out;
- }
- this_entry->d_off = off;
- this_entry->d_ino = entry.d_ino;
- this_entry->d_type = entry.d_type;
- off++;
-
- list_add_tail (&this_entry->list, &entries->list);
-
- filled += this_size;
- count++;
- }
-out:
- BD_UNLOCK (bd_lock);
- return count;
-}
-
-int32_t
-bd_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, int whichop)
-{
- uint64_t tmp_bd_fd = 0;
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- int count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- gf_dirent_t entries;
- gf_dirent_t *tmp_entry = NULL;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
- char *devpath = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- INIT_LIST_HEAD (&entries.list);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL, fd=%p", fd);
- op_errno = -EINVAL;
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- LOCK (&fd->lock);
- {
- count = __bd_fill_readdir (&priv->lock, bd_fd, off,
- size, &entries);
- }
- UNLOCK (&fd->lock);
-
- /* pick ENOENT to indicate EOF */
- op_errno = errno;
- op_ret = count;
-
- if (whichop != GF_FOP_READDIRP)
- goto out;
-
- BD_RD_LOCK (&priv->lock);
- list_for_each_entry (tmp_entry, &entries.list, list) {
- char path[PATH_MAX];
- sprintf (path, "%s/%s", bd_fd->p_entry->name,
- tmp_entry->d_name);
- bdentry = bd_entry_get (path);
- if (!bdentry) {
- gf_log (this->name, GF_LOG_WARNING,
- "entry failed %s\n", tmp_entry->d_name);
- continue;
- }
- if (bdentry->attr->ia_ino)
- tmp_entry->d_ino = bdentry->attr->ia_ino;
- memcpy (&tmp_entry->d_stat,
- bdentry->attr, sizeof (tmp_entry->d_stat));
- bd_entry_put (bdentry);
- GF_FREE (devpath);
- }
- BD_UNLOCK (&priv->lock);
-
-out:
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL);
-
- gf_dirent_free (&entries);
-
- return 0;
-}
-
-int32_t
-bd_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, dict_t *dict)
-{
- bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR);
- return 0;
-}
-
-
-int32_t
-bd_readdirp (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, dict_t *dict)
-{
- bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP);
- return 0;
-}
-
-int32_t
-bd_priv (xlator_t *this)
-{
- return 0;
-}
-
-int32_t
-bd_inode (xlator_t *this)
-{
- return 0;
-}
-
-/* unsupported interfaces */
-int32_t
-bd_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- char *dest = NULL;
-
- dest = alloca (size + 1);
- STACK_UNWIND_STRICT (readlink, frame, -1, ENOSYS, dest, &stbuf, NULL);
- return 0;
-}
-
-int
-bd_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t dev, mode_t umask, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (mknod, frame, -1, ENOSYS,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int
-bd_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (mkdir, frame, -1, ENOSYS,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int
-bd_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
-{
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (rmdir, frame, -1, ENOSYS,
- &preparent, &postparent, NULL);
- return 0;
-}
-
-int32_t
-bd_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (getxattr, frame, -1, ENOSYS, NULL, NULL);
- return 0;
-}
-
-int32_t
-bd_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOSYS, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-bd_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (removexattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fsyncdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int datasync, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-static int gf_bd_lk_log;
-int32_t
-bd_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
-{
- struct gf_flock nullock = {0, };
-
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL);
- return 0;
-}
-
-int32_t
-bd_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-
-int32_t
-bd_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_rchecksum (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, int32_t len, dict_t *xdata)
-{
- int32_t weak_checksum = 0;
- unsigned char strong_checksum[MD5_DIGEST_LENGTH];
-
- STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOSYS,
- weak_checksum, strong_checksum, NULL);
- return 0;
-}
-
-int
-bd_xattrop (call_frame_t *frame, xlator_t *this,
- loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL);
- return 0;
-}
-
-
-int
-bd_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL);
- return 0;
-}
-
-int bd_xl_op_create (bd_priv_t *priv, dict_t *input, dict_t *output)
-{
- char *vg = NULL;
- char *lv = NULL;
- char *path = NULL;
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- char *size = 0;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *buff = NULL;
- char *buffp = NULL;
- char *save = NULL;
-
- ret = dict_get_str (input, "size", &size);
- if (ret) {
- gf_asprintf (&error, "no size specified");
- goto out;
- }
- ret = dict_get_str (input, "path", &path);
- if (ret) {
- gf_asprintf (&error, "no path specified");
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- lv = strtok_r (NULL, "/", &save);
-
- if (!vg || !lv) {
- gf_asprintf (&error, "invalid path %s", path);
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- if (!p_entry) {
- ret = -ENOENT;
- goto out;
- }
- BD_ENTRY (priv, lventry, path);
- if (lventry) {
- ret = -EEXIST;
- gf_asprintf (&error, "%s already exists", lv);
- BD_PUT_ENTRY (priv, lventry);
- goto out;
- }
-
- ret = bd_create_lv (priv, p_entry, vg, lv, size, 0);
- if (ret < 0) {
- gf_asprintf (&error, "bd_create_lv error %d", -ret);
- goto out;
- }
- ret = 0;
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
-
- if (buffp)
- GF_FREE (buffp);
-
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- return ret;
-}
-
-int bd_xl_op_delete (bd_priv_t *priv, dict_t *input, dict_t *output)
-{
- char *vg = NULL;
- char *path = NULL;
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *buff = NULL;
- char *buffp = NULL;
- char *save = NULL;
- int op_errno = 0;
-
- ret = dict_get_str (input, "path", &path);
- if (ret) {
- gf_asprintf (&error, "no path specified");
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- if (!vg) {
- gf_asprintf (&error, "invalid path %s", path);
- op_errno = EINVAL;
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- BD_ENTRY (priv, lventry, path);
- if (!p_entry || !lventry) {
- op_errno = -ENOENT;
- gf_asprintf (&error, "%s not found", path);
- ret = -1;
- goto out;
- }
- ret = bd_delete_lv (priv, p_entry, lventry, path, &op_errno);
- if (ret < 0) {
- gf_asprintf (&error, "bd_delete_lv error, error:%d", op_errno);
- goto out;
- }
- ret = 0;
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (buffp)
- GF_FREE (buffp);
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- return ret;
-}
-
-int bd_xl_op_clone(bd_priv_t *priv, int subop, dict_t *input, dict_t *output)
-{
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *vg = NULL;
- char *lv = NULL;
- char *dest_lv = NULL;
- char *size = NULL;
- char *buff = NULL;
- char *buffp = NULL;
- char *path = NULL;
- char *save = NULL;
- char *npath = NULL;
-
- ret = dict_get_str (input, "path", &path);
- ret = dict_get_str (input, "dest_lv", &dest_lv);
- ret = dict_get_str (input, "size", &size);
-
- if (!path || !dest_lv) {
- gf_asprintf (&error, "invalid arguments");
- ret = -1;
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- lv = strtok_r (NULL, "/", &save);
- if (!lv) {
- gf_asprintf (&error, "lv not given %s", path);
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- if (!p_entry) {
- gf_asprintf (&error, "%s does not exist", vg);
- retval = dict_set_str (output, "error", error);
- goto out;
- }
-
- BD_ENTRY (priv, lventry, path);
- if (!lventry) {
- gf_asprintf (&error, "%s does not exist", path);
- ret = -1;
- goto out;
- }
- BD_PUT_ENTRY (priv, lventry);
- lventry = NULL;
- gf_asprintf (&npath, "/%s/%s", vg, dest_lv);
- BD_ENTRY (priv, lventry, npath);
- if (lventry) {
- gf_asprintf (&error, "%s already exists", dest_lv);
- BD_PUT_ENTRY (priv, lventry);
- ret = -1;
- goto out;
- }
-
- if (subop == GF_BD_OP_SNAPSHOT_BD) {
- if (!size) {
- gf_asprintf (&error, "size not given");
- ret = -1;
- goto out;
- }
- ret = bd_snapshot_lv (priv, p_entry, output, lv, dest_lv,
- size, NULL);
- } else
- ret = bd_clone_lv (priv, p_entry, output, vg, lv, dest_lv,
- NULL);
-
- if (ret)
- goto out;
- ret = 0;
-out:
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (npath)
- GF_FREE (npath);
- if (buffp)
- GF_FREE (buffp);
- return ret;
-}
-
-int32_t
-bd_notify (xlator_t *this, dict_t *input, dict_t *output)
-{
- int ret = -1;
- int retval = -1;
- int32_t bdop = -1;
- bd_priv_t *priv = NULL;
- char *error = NULL;
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = dict_get_int32 (input, "bd-op", (int32_t *)&bdop);
- if (ret) {
- gf_asprintf (&error, "no sub-op specified");
- goto out;
- }
-
- switch (bdop)
- {
- case GF_BD_OP_NEW_BD:
- ret = bd_xl_op_create (priv, input, output);
- break;
- case GF_BD_OP_DELETE_BD:
- ret = bd_xl_op_delete (priv, input, output);
- break;
- case GF_BD_OP_CLONE_BD:
- case GF_BD_OP_SNAPSHOT_BD:
- ret = bd_xl_op_clone (priv, bdop, input, output);
- break;
- default:
- gf_asprintf (&error, "invalid bd-op %d specified", bdop);
- retval = dict_set_dynstr (output, "error", error);
- goto out;
- }
-
-out:
- return ret;
-}
-
-/**
- * notify - when parent sends PARENT_UP, send CHILD_UP event from here
- */
-int32_t
-notify (xlator_t *this,
- int32_t event,
- void *data,
- ...)
-{
- va_list ap;
- int ret = 0;
- void *data2 = NULL;
- dict_t *input = NULL;
- dict_t *output = NULL;
-
- va_start (ap, data);
- data2 = va_arg (ap, dict_t *);
- va_end (ap);
-
- switch (event)
- {
- case GF_EVENT_PARENT_UP:
- {
- /* Tell the parent that bd xlator is up */
- default_notify (this, GF_EVENT_CHILD_UP, data);
- }
- break;
- case GF_EVENT_TRANSLATOR_OP:
- input = data;
- output = data2;
- if (!output)
- output = dict_new ();
- ret = bd_notify (this, input, output);
- break;
-
- default:
- break;
- }
- return ret;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-
-/**
- * init - Constructs lists of LVs in the given VG
- */
-int
-init (xlator_t *this)
-{
- bd_priv_t *_private = NULL;
- int ret = 0;
- char *vg = NULL;
- char *device = NULL;
-
- LOCK_INIT (&inode_lk);
-
- bd_rootp = bd_entry_add_root ();
- if (!bd_rootp) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: adding root entry failed");
- return -1;
- }
-
- if (this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd cannot have subvolumes");
- ret = -1;
- goto out;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling. Please check the volume file.");
- }
-
- ret = dict_get_str (this->options, "device", &device);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify backend");
- return -1;
- }
-
- /* Now we support only LV device */
- if (strcasecmp (device, BACKEND_VG)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: unknown %s backend %s", BD_XLATOR, device);
- return -1;
- }
-
- ret = dict_get_str (this->options, "export", &vg);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify volume groups");
- return -1;
- }
-
- ret = 0;
- _private = GF_CALLOC (1, sizeof(*_private), gf_bd_private);
- if (!_private)
- goto error;
-
- pthread_rwlock_init (&_private->lock, NULL);
- this->private = (void *)_private;
- _private->handle = NULL;
- _private->vg = gf_strdup (vg);
- if (!_private->vg) {
- goto error;
- }
-
- if (bd_build_lv_list (this->private, vg) < 0)
- goto error;
-
-out:
- return 0;
-error:
- BD_WR_LOCK (&_private->lock);
- bd_entry_cleanup ();
- lvm_quit (_private->handle);
- if (_private->vg)
- GF_FREE (_private->vg);
- GF_FREE (_private);
- return -1;
-}
-
-void
-fini (xlator_t *this)
-{
- bd_priv_t *priv = this->private;
- if (!priv)
- return;
- lvm_quit (priv->handle);
- BD_WR_LOCK (&priv->lock);
- bd_entry_cleanup ();
- BD_UNLOCK (&priv->lock);
- GF_FREE (priv->vg);
- this->private = NULL;
- GF_FREE (priv);
- return;
-}
-
-struct xlator_dumpops dumpops = {
- .priv = bd_priv,
- .inode = bd_inode,
-};
-
-struct xlator_fops fops = {
- /* Not supported */
- .readlink = bd_readlink,
- .mknod = bd_mknod,
- .mkdir = bd_mkdir,
- .rmdir = bd_rmdir,
- .setxattr = bd_setxattr,
- .fsetxattr = bd_fsetxattr,
- .getxattr = bd_getxattr,
- .fgetxattr = bd_fgetxattr,
- .removexattr = bd_removexattr,
- .fremovexattr= bd_fremovexattr,
- .fsyncdir = bd_fsyncdir,
- .lk = bd_lk,
- .inodelk = bd_inodelk,
- .finodelk = bd_finodelk,
- .entrylk = bd_entrylk,
- .fentrylk = bd_fentrylk,
- .rchecksum = bd_rchecksum,
- .xattrop = bd_xattrop,
-
- /* Supported */
- .lookup = bd_lookup,
- .opendir = bd_opendir,
- .readdir = bd_readdir,
- .readdirp = bd_readdirp,
- .stat = bd_stat,
- .statfs = bd_statfs,
- .open = bd_open,
- .access = bd_access,
- .flush = bd_flush,
- .readv = bd_readv,
- .fstat = bd_fstat,
- .truncate = bd_truncate,
- .ftruncate = bd_ftruncate,
- .fsync = bd_fsync,
- .writev = bd_writev,
- .fstat = bd_fstat,
- .create = bd_create,
- .setattr = bd_setattr,
- .fsetattr = bd_fsetattr,
- .unlink = bd_unlink,
- .link = bd_link,
- .symlink = bd_symlink,
- .rename = bd_rename,
-};
-
-struct xlator_cbks cbks = {
- .releasedir = bd_releasedir,
- .release = bd_release,
-};
-
-struct volume_options options[] = {
- { .key = {"export"},
- .type = GF_OPTION_TYPE_STR},
- { .key = {"device"},
- .type = GF_OPTION_TYPE_STR},
- { .key = {NULL} }
-};
diff --git a/xlators/storage/bd_map/src/bd_map.h b/xlators/storage/bd_map/src/bd_map.h
deleted file mode 100644
index 1a0f4248e..000000000
--- a/xlators/storage/bd_map/src/bd_map.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _BD_MAP_H
-#define _BD_MAP_H
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "mem-types.h"
-
-#define BD_XLATOR "block device mapper xlator"
-
-#define BACKEND_VG "vg"
-
-enum gf_bd_mem_types_ {
- gf_bd_fd = gf_common_mt_end + 1,
- gf_bd_private,
- gf_bd_entry,
- gf_bd_attr,
- gf_bd_mt_end
-};
-
-/*
- * Each BD/LV is represented by this data structure
- * Usually root entry will have only children and there is no sibling for that
- * All other entries may have children and/or sibling entries
- * If an entry is a Volume Group it will have child (. & .. and Logical
- * Volumes) and also other Volume groups will be a sibling for this
- */
-typedef struct bd_entry {
- struct list_head child; /* List to child */
- struct list_head sibling; /* List of siblings */
- struct bd_entry *parent;/* Parent of this node */
- struct bd_entry *link; /* Link to actual entry, if its . or .. */
- char name[NAME_MAX];
- struct iatt *attr;
- int refcnt;
- uint64_t size;
- pthread_rwlock_t lock;
-} bd_entry_t;
-
-/**
- * bd_fd - internal structure common to file and directory fd's
- */
-typedef struct bd_fd {
- bd_entry_t *entry;
- bd_entry_t *p_entry; /* Parent entry */
- int fd;
- int32_t flag;
-} bd_fd_t;
-
-typedef struct bd_priv {
- lvm_t handle;
- pthread_rwlock_t lock;
- char *vg;
-} bd_priv_t;
-
-#endif
diff --git a/xlators/storage/bd_map/src/bd_map_help.c b/xlators/storage/bd_map/src/bd_map_help.c
deleted file mode 100644
index 0613aa383..000000000
--- a/xlators/storage/bd_map/src/bd_map_help.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#define __XOPEN_SOURCE 500
-
-#include <libgen.h>
-#include <time.h>
-#include <lvm2app.h>
-
-#include "bd_map.h"
-#include "bd_map_help.h"
-#include "defaults.h"
-#include "glusterfs3-xdr.h"
-
-#define CHILD_ENTRY(node) list_entry ((&node->child)->next, typeof(*node), \
- child)
-
-bd_entry_t *bd_rootp;
-gf_lock_t inode_lk;
-static uint64_t bd_entry_ino = 5000; /* Starting inode */
-
-static void bd_entry_get_ino (uint64_t *inode)
-{
- LOCK (&inode_lk);
- {
- *inode = bd_entry_ino++;
- }
- UNLOCK (&inode_lk);
-}
-
-void bd_update_time (bd_entry_t *entry, int type)
-{
- struct timespec ts;
-
- clock_gettime (CLOCK_REALTIME, &ts);
- if (type == 0) {
- entry->attr->ia_mtime = ts.tv_sec;
- entry->attr->ia_mtime_nsec = ts.tv_nsec;
- entry->attr->ia_atime = ts.tv_sec;
- entry->attr->ia_atime_nsec = ts.tv_nsec;
- } else if (type == 1) {
- entry->attr->ia_mtime = ts.tv_sec;
- entry->attr->ia_mtime_nsec = ts.tv_nsec;
- } else {
- entry->attr->ia_atime = ts.tv_sec;
- entry->attr->ia_atime_nsec = ts.tv_nsec;
- }
-}
-
-static bd_entry_t *bd_entry_init (const char *name)
-{
- bd_entry_t *bdentry;
-
- bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry);
- if (!bdentry)
- return NULL;
-
- bdentry->attr = GF_MALLOC (sizeof(struct iatt), gf_bd_attr);
- if (!bdentry->attr) {
- GF_FREE (bdentry);
- return NULL;
- }
-
- strcpy (bdentry->name, name);
- INIT_LIST_HEAD (&bdentry->sibling);
- INIT_LIST_HEAD (&bdentry->child);
- bdentry->link = NULL;
- bdentry->refcnt = 0;
- return bdentry;
-}
-
-static bd_entry_t *bd_entry_clone (bd_entry_t *orig, char *name)
-{
- bd_entry_t *bdentry;
-
- bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry);
- if (!bdentry)
- return NULL;
-
- bdentry->attr = orig->attr;
-
- strcpy (bdentry->name, name);
- INIT_LIST_HEAD (&bdentry->sibling);
- INIT_LIST_HEAD (&bdentry->child);
- bdentry->link = orig;
- bdentry->refcnt = 0;
- return bdentry;
-}
-
-static void bd_entry_init_iattr (struct iatt *attr, int type)
-{
- struct timespec ts = {0, };
-
- clock_gettime (CLOCK_REALTIME, &ts);
- attr->ia_dev = ia_makedev (0, 0); /* FIXME: */
- attr->ia_type = type;
- attr->ia_prot = ia_prot_from_st_mode (0750);
- attr->ia_nlink = 2;
- attr->ia_uid = 0;
- attr->ia_gid = 0;
- attr->ia_rdev = ia_makedev (0, 0);
-
- attr->ia_size = 4096; /* FIXME */
- attr->ia_blksize = 4096;
- attr->ia_blocks = 0;
-
- attr->ia_atime = ts.tv_sec;
- attr->ia_atime_nsec = ts.tv_nsec;
- attr->ia_mtime = ts.tv_sec;
- attr->ia_mtime_nsec = ts.tv_nsec;
- attr->ia_ctime = ts.tv_sec;
- attr->ia_ctime_nsec = ts.tv_nsec;
-}
-
-/*
- * bd_entry_istat: Initialize iatt strucutre for a given path on success
- */
-void bd_entry_istat (const char *path, struct iatt *attr, int type)
-{
- struct stat stbuf = {0, };
-
- if (stat (path, &stbuf) < 0)
- bd_entry_init_iattr (attr, type);
- else
- iatt_from_stat (attr, &stbuf);
- sprintf ((char *)attr->ia_gfid, "%lx", stbuf.st_ino);
-}
-
-/*
- * Adds the root entry and required entries
- * ie header entry followed by . and .. entries
- */
-bd_entry_t *bd_entry_add_root (void)
-{
- bd_entry_t *bdentry = NULL;
- bd_entry_t *h_entry = NULL;
- bd_entry_t *d_entry = NULL;
- bd_entry_t *dd_entry = NULL;
-
- bdentry = bd_entry_init ("/");
- if (!bdentry)
- return NULL;
-
- bdentry->parent = bdentry;
-
- bd_entry_get_ino (&bdentry->attr->ia_ino);
- sprintf ((char *)bdentry->attr->ia_gfid, "%ld",
- bdentry->attr->ia_ino << 2);
- bd_entry_init_iattr (bdentry->attr, IA_IFDIR);
-
- h_entry = bd_entry_clone (bdentry, "");
- bdentry->child.next = &h_entry->child;
- bdentry->child.prev = &h_entry->child;
-
- d_entry = bd_entry_clone (bdentry, ".");
- dd_entry = bd_entry_clone (bdentry, "..");
-
- list_add_tail (&d_entry->sibling, &h_entry->sibling);
- list_add_tail (&dd_entry->sibling, &h_entry->sibling);
- return bdentry;
-}
-
-bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name,
- struct iatt *iattr, ia_type_t type)
-{
- bd_entry_t *bdentry = NULL;
- bd_entry_t *h_entry = NULL;
- bd_entry_t *d_entry = NULL;
- bd_entry_t *dd_entry = NULL;
- bd_entry_t *sentry = NULL;
- struct timespec ts = { 0, };
-
- if (!parent)
- parent = bd_rootp;
-
- if (type != IA_IFREG && type != IA_IFDIR)
- return NULL;
-
- bdentry = bd_entry_init (name);
- if (!bdentry)
- return NULL;
-
- bdentry->parent = parent;
-
- iattr->ia_type = type;
-
- bd_entry_get_ino (&iattr->ia_ino);
- if (IA_ISDIR(type)) {
- h_entry = bd_entry_clone (bdentry, "");
- parent->attr->ia_nlink++;
- bdentry->child.next = &h_entry->child;
- bdentry->child.prev = &h_entry->child;
-
- d_entry = bd_entry_clone (bdentry, ".");
- dd_entry = bd_entry_clone (bdentry, "..");
-
- list_add_tail (&d_entry->sibling, &h_entry->sibling);
- list_add_tail (&dd_entry->sibling, &h_entry->sibling);
- }
- memcpy (bdentry->attr, iattr, sizeof(*iattr));
-
- clock_gettime (CLOCK_REALTIME, &ts);
- parent->attr->ia_mtime = ts.tv_sec;
- parent->attr->ia_mtime_nsec = ts.tv_nsec;
- bdentry->size = iattr->ia_size;
-
- sentry = CHILD_ENTRY (parent);
- list_add_tail (&bdentry->sibling, &sentry->sibling);
- return bdentry;
-}
-
-bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent)
-{
- bd_entry_t *centry = NULL;
- bd_entry_t *bdentry = NULL;
-
- if (!parent)
- parent = bd_rootp;
-
- if (parent->child.next == &parent->child)
- return NULL;
-
- centry = CHILD_ENTRY (parent);
- if (!strcmp (centry->name, name))
- return centry;
-
- list_for_each_entry (bdentry, &centry->sibling, sibling) {
- if (!strcmp (bdentry->name, name))
- return bdentry;
- }
- return NULL;
-}
-
-/* FIXME: Do we need hashing here? */
-bd_entry_t *bd_entry_find_by_gfid (const char *path)
-{
- bd_entry_t *h = NULL;
- bd_entry_t *tmp = NULL;
- bd_entry_t *tmp2 = NULL;
- bd_entry_t *node = NULL;
- bd_entry_t *cnode = NULL;
- bd_entry_t *leaf = NULL;
- char *gfid = NULL;
- char *cp = NULL;
- char *bgfid = NULL;
- bd_entry_t *entry = NULL;
-
- gfid = GF_MALLOC (strlen(path) + 1, gf_common_mt_char);
- sscanf (path, "<gfid:%s", gfid);
- if (!gfid)
- return NULL;
-
- cp = strchr(gfid, '>');
- *cp = '\0';
-
- node = CHILD_ENTRY (bd_rootp);
-
- bgfid = GF_MALLOC (GF_UUID_BUF_SIZE, gf_common_mt_char);
- if (!bgfid)
- return NULL;
-
- list_for_each_entry_safe (h, tmp, &node->sibling, sibling) {
- uuid_utoa_r (h->attr->ia_gfid, bgfid);
- if (!h->link && !strcmp (gfid, bgfid)) {
- entry = h;
- goto out;
- }
-
- /* if we have children for this node */
- if (h->child.next != &h->child) {
- cnode = CHILD_ENTRY (h);
- uuid_utoa_r (cnode->attr->ia_gfid, bgfid);
- if (!cnode->link && !strcmp (gfid, bgfid)) {
- entry = cnode;
- goto out;
- }
-
- list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling),
- sibling) {
- uuid_utoa_r (leaf->attr->ia_gfid, bgfid);
- if (!leaf->link && !strcmp (gfid, bgfid)) {
- entry = leaf;
- goto out;
- }
-
- }
- }
- }
-out:
- if (bgfid)
- GF_FREE (bgfid);
-
- return entry;
-}
-
-/* Called with priv->bd_lock held */
-bd_entry_t *bd_entry_get (const char *name)
-{
- bd_entry_t *pentry = NULL;
- char *path = NULL;
- char *comp = NULL;
- char *save = NULL;
-
- if (!strncmp (name, "<gfid:", 5)) {
- pentry = bd_entry_find_by_gfid (name);
- if (pentry)
- pentry->refcnt++;
- return pentry;
- }
-
- if (!strcmp (name, "/")) {
- bd_rootp->refcnt++;
- return bd_rootp;
- }
-
- path = gf_strdup (name);
- comp = strtok_r (path, "/", &save);
- pentry = bd_entry_get_list (comp, NULL);
- if (!pentry)
- goto out;
- while (comp) {
- comp = strtok_r (NULL, "/", &save);
- if (!comp)
- break;
- pentry = bd_entry_get_list (comp, pentry);
- if (!pentry)
- goto out;
- }
-
- pentry->refcnt++;
-out:
- GF_FREE (path);
- return pentry;
-}
-
-int bd_entry_rm (const char *path)
-{
- bd_entry_t *bdentry = NULL;
- int ret = -1;
-
- bdentry = bd_entry_get (path);
- if (!bdentry)
- goto out;
-
- list_del_init (&bdentry->sibling);
- list_del_init (&bdentry->child);
- GF_FREE (bdentry);
-
- ret = 0;
-out:
- return ret;
-}
-
-
-
-/* Called with priv->bd_lock held */
-void bd_entry_put (bd_entry_t *entry)
-{
- entry->refcnt--;
-}
-
-int bd_build_lv_list (bd_priv_t *priv, char *vg_name)
-{
- struct dm_list *lv_dm_list = NULL;
- struct lvm_lv_list *lv_list = NULL;
- struct iatt iattr = {0, };
- char path[PATH_MAX] = {0, };
- vg_t vg = NULL;
- bd_entry_t *vg_map = NULL;
- bd_entry_t *bd = NULL;
- int ret = -1;
- const char *lv_name = NULL;
-
- priv->handle = lvm_init (NULL);
- if (!priv->handle) {
- gf_log (THIS->name, GF_LOG_CRITICAL, "FATAL: bd_init failed");
- return -1;
- }
-
- BD_WR_LOCK (&priv->lock);
-
- vg = lvm_vg_open (priv->handle, vg_name, "r", 0);
- if (!vg) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
- "opening vg %s failed", vg_name);
- goto out;
- }
- /* get list of LVs associated with this VG */
- lv_dm_list = lvm_vg_list_lvs (vg);
- sprintf (path, "/dev/%s", vg_name);
- bd_entry_istat (path, &iattr, IA_IFDIR);
- vg_map = bd_entry_add (bd_rootp, vg_name, &iattr,
- IA_IFDIR);
- if (!vg_map) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
- "bd_add_entry failed");
- goto out;
- }
- ret = 0;
- if (!lv_dm_list) /* no lvs for this VG */
- goto out;
-
- dm_list_iterate_items (lv_list, lv_dm_list) {
- if (!lv_list)
- continue;
- lv_name = lvm_lv_get_name (lv_list->lv);
- /* snapshot%d is reserved name */
- if (!strncmp (lv_name, "snapshot", 8))
- continue;
- /* get symbolic path for this LV */
- sprintf (path, "/dev/%s/%s", vg_name, lv_name);
- bd_entry_istat (path, &iattr, IA_IFREG);
- /* Make the file size equivalant to BD size */
- iattr.ia_size = lvm_lv_get_size (lv_list->lv);
- /* got LV, add it to our tree */
- bd = bd_entry_add (vg_map,
- lvm_lv_get_name (lv_list->lv),
- &iattr, IA_IFREG);
- if (bd == NULL) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "bd_add_entry failed");
- goto out;
- }
- }
-out:
- if (vg)
- lvm_vg_close (vg);
-
- BD_UNLOCK (&priv->lock);
- return ret;
-}
-
-/*
- * Called with bd_lock held to cleanup entire list. If there was a
- * reference to any one of the entry, nothing cleared.
- * Return 0 on success -1 in case if there is a reference to the entry
- */
-int bd_entry_cleanup (void)
-{
- bd_entry_t *node = NULL;
- bd_entry_t *tmp = NULL;
- bd_entry_t *tmp2 = NULL;
- bd_entry_t *cnode = NULL;
- bd_entry_t *h = NULL;
- bd_entry_t *leaf = NULL;
-
- if (!bd_rootp)
- return 0;
-
- node = CHILD_ENTRY (bd_rootp);
- if (node->refcnt) {
- gf_log (THIS->name, GF_LOG_WARNING,
- "entry %s is inuse\n", node->name);
- return -1;
- }
- list_for_each_entry_safe (h, tmp, &node->sibling, sibling) {
- /* if we have children for this node */
- if (h->child.next != &h->child) {
- cnode = CHILD_ENTRY (h);
- list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling),
- sibling) {
- list_del_init (&leaf->sibling);
- list_del_init (&leaf->child);
- if (!leaf->link)
- GF_FREE (leaf->attr);
- GF_FREE (leaf);
- }
- list_del_init (&cnode->sibling);
- list_del_init (&cnode->child);
- if (!cnode->link)
- GF_FREE (cnode->attr);
- GF_FREE (cnode);
- }
- if (!h->link)
- GF_FREE (h->attr);
- GF_FREE (h);
- }
- GF_FREE (h);
- GF_FREE (bd_rootp->attr);
- GF_FREE (bd_rootp);
- return 0;
-}
diff --git a/xlators/storage/bd_map/src/bd_map_help.h b/xlators/storage/bd_map/src/bd_map_help.h
deleted file mode 100644
index 9fafa2d13..000000000
--- a/xlators/storage/bd_map/src/bd_map_help.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client.
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#ifndef _BD_MAP_HELP_H
-#define _BD_MAP_HELP_H
-
-#define BD_RD_LOCK(lock) \
- pthread_rwlock_rdlock (lock);
-
-#define BD_WR_LOCK(lock) \
- pthread_rwlock_wrlock (lock);
-
-#define BD_UNLOCK(lock) \
- pthread_rwlock_unlock (lock);
-
-#define BD_WR_ENTRY(priv, bdentry, path) \
- do { \
- BD_WR_LOCK (&priv->lock); \
- bdentry = bd_entry_get (path); \
- BD_UNLOCK (&priv->lock); \
- } while (0)
-
-#define BD_ENTRY(priv, bdentry, path) \
- do { \
- BD_RD_LOCK (&priv->lock); \
- bdentry = bd_entry_get (path); \
- BD_UNLOCK (&priv->lock); \
- } while (0)
-
-#define BD_PUT_ENTRY(priv, bdentry) \
- do { \
- BD_RD_LOCK (&priv->lock); \
- bd_entry_put (bdentry); \
- BD_UNLOCK (&priv->lock); \
- } while (0)
-
-#define BD_ENTRY_UPDATE_TIME(bdentry) bd_update_time (bdentry, 0)
-#define BD_ENTRY_UPDATE_ATIME(bdentry) bd_update_time (bdentry, 2)
-#define BD_ENTRY_UPDATE_MTIME(bdentry) bd_update_time (bdentry, 1)
-
-extern bd_entry_t *bd_rootp;
-extern gf_lock_t inode_lk;
-
-void bd_entry_istat (const char *path, struct iatt *attr, int type);
-bd_entry_t *bd_entry_add_root (void);
-bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name,
- struct iatt *iattr, ia_type_t type);
-bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent);
-bd_entry_t *bd_entry_get (const char *name);
-void bd_entry_put (bd_entry_t *entry);
-int bd_build_lv_list (bd_priv_t *priv, char *vg);
-int bd_entry_cleanup (void);
-void bd_update_time (bd_entry_t *entry, int type);
-int bd_entry_rm (const char *path);
-
-#endif
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am
index 03623cf04..88efcc784 100644
--- a/xlators/storage/posix/src/Makefile.am
+++ b/xlators/storage/posix/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = posix.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
-posix_la_LDFLAGS = -module -avoidversion
+posix_la_LDFLAGS = -module -avoid-version
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO)
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index f807618ce..c3bbddd67 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -136,11 +136,7 @@ posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2)
/* Hack to notify higher layers of EOF. */
- if (postbuf.ia_size == 0)
- op_errno = ENOENT;
- else if ((offset + iov.iov_len) == postbuf.ia_size)
- op_errno = ENOENT;
- else if (offset > postbuf.ia_size)
+ if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
op_errno = ENOENT;
LOCK (&priv->lock);
@@ -490,8 +486,8 @@ posix_aio_init (xlator_t *this)
goto out;
}
- ret = pthread_create (&priv->aiothread, NULL,
- posix_aio_thread, this);
+ ret = gf_thread_create (&priv->aiothread, NULL,
+ posix_aio_thread, this);
if (ret != 0) {
io_destroy (priv->ctxp);
goto out;
@@ -566,7 +562,7 @@ __posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
{
xlator_t *this = THIS;
gf_log (this->name, GF_LOG_INFO,
- "Linux AIO not availble at build-time."
+ "Linux AIO not available at build-time."
" Continuing with synchronous IO");
return;
}
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
index 33bf3db56..219a582c9 100644
--- a/xlators/storage/posix/src/posix-handle.c
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -573,13 +573,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat
}
}
- ret = lstat (newpath, &newbuf);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "lstat on %s failed (%s)", newpath, strerror (errno));
- return -1;
- }
-
if (newbuf.st_ino != oldbuf->st_ino ||
newbuf.st_dev != oldbuf->st_dev) {
gf_log (this->name, GF_LOG_WARNING,
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 58708a347..e295f8850 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -22,6 +22,7 @@
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
+#include <signal.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -44,16 +45,9 @@
#include "timer.h"
#include "glusterfs3-xdr.h"
#include "hashfn.h"
+#include "glusterfs-acl.h"
#include <fnmatch.h>
-typedef struct {
- xlator_t *this;
- const char *real_path;
- dict_t *xattr;
- struct iatt *stbuf;
- loc_t *loc;
-} posix_xattr_filler_t;
-
char *marker_xattrs[] = {"trusted.glusterfs.quota.*",
"trusted.glusterfs.*.xtime",
NULL};
@@ -181,14 +175,9 @@ _posix_xattr_get_set (dict_t *xattr_req,
}
} else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) {
loc = filler->loc;
- if (loc && !list_empty (&loc->inode->fd_list)) {
- ret = dict_set_uint32 (filler->xattr, key, 1);
- if (ret < 0)
- gf_log (filler->this->name, GF_LOG_WARNING,
- "Failed to set dictionary value for %s",
- key);
- } else {
- ret = dict_set_uint32 (filler->xattr, key, 0);
+ if (loc) {
+ ret = dict_set_uint32 (filler->xattr, key,
+ loc->inode->fd_count);
if (ret < 0)
gf_log (filler->this->name, GF_LOG_WARNING,
"Failed to set dictionary value for %s",
@@ -896,8 +885,8 @@ posix_spawn_janitor_thread (xlator_t *this)
LOCK (&priv->lock);
{
if (!priv->janitor_present) {
- ret = pthread_create (&priv->janitor, NULL,
- posix_janitor_thread_proc, this);
+ ret = gf_thread_create (&priv->janitor, NULL,
+ posix_janitor_thread_proc, this);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
@@ -913,6 +902,74 @@ unlock:
UNLOCK (&priv->lock);
}
+/*
+ * Tell whether the inode described by @stat was changed "just now".
+ *
+ * Returns 1 when st_ctime lies in the closed window
+ * [current second - 1, current second] as reported by gettimeofday(),
+ * and 0 otherwise (older files as well as ctimes in the future).
+ */
+static int
+is_fresh_file (struct stat *stat)
+{
+        struct timeval  now;
+        int             fresh = 0;
+
+        gettimeofday (&now, NULL);
+
+        if (stat->st_ctime < (now.tv_sec - 1))
+                fresh = 0;      /* changed more than a second ago */
+        else if (stat->st_ctime > now.tv_sec)
+                fresh = 0;      /* ctime is ahead of the clock */
+        else
+                fresh = 1;
+
+        return fresh;
+}
+
+
+/*
+ * Lookup-side wrapper around posix_gfid_set().
+ *
+ * It exists to close a race between an inode creation FOP
+ * (mkdir/mknod/create ...) and a concurrent lookup:
+ *
+ *      {create thread}                 |  {lookup thread}
+ *                                      |
+ *  t0  mkdir ("name")                  |
+ *  t1                                  |  posix_gfid_set ("name", 2);
+ *  t2  posix_gfid_set ("name", 1);     |
+ *  t3  lstat ("name");                 |  lstat ("name");
+ *
+ * Here the mkdir FOP would end up with GFID 2 although it should have
+ * been GFID 1, which matters when GFID 1 was set on the other
+ * subvolumes of replicate/distribute.
+ *
+ * The "solution": when lookup is about to set a GFID on a file that was
+ * created very recently and does not carry a GFID yet (i.e. between t1
+ * and t2), behave as though this function had been called at t0, i.e.
+ * fail with ENOENT instead of setting the GFID.
+ *
+ * Returns 0 when @xattr_req is NULL or lstat on @path fails, -1 with
+ * errno = ENOENT for a fresh file without a GFID xattr, and otherwise
+ * whatever posix_gfid_set() returns.
+ */
+int
+posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
+{
+        uuid_t          gfid_on_disk;
+        struct stat     stbuf = {0, };
+        int             ret = 0;
+
+        if (!xattr_req)
+                return ret;
+
+        if (sys_lstat (path, &stbuf) != 0)
+                return ret;
+
+        /* a complete GFID xattr is exactly 16 bytes long */
+        ret = sys_lgetxattr (path, GFID_XATTR_KEY, gfid_on_disk, 16);
+        if ((ret != 16) && is_fresh_file (&stbuf)) {
+                errno = ENOENT;
+                return -1;
+        }
+
+        return posix_gfid_set (this, path, loc, xattr_req);
+}
+
+
int
posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
{
@@ -926,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
if (sys_lstat (path, &stat) != 0)
goto out;
- data = dict_get (xattr_req, "system.posix_acl_access");
+ data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_access",
+ ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
}
- data = dict_get (xattr_req, "system.posix_acl_default");
+ data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_default",
+ ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
@@ -946,35 +1003,47 @@ out:
return ret;
}
+/*
+ * dict_foreach() callback used by posix_entry_create_xattr_set().
+ *
+ * @tmp is a posix_xattr_filler_t carrying the xlator and the real path.
+ * The GFID, "gfid-req", POSIX ACL and file-content-request keys are
+ * skipped; every other pair is handed to posix_handle_pair() with
+ * XATTR_CREATE.
+ *
+ * Returns 0 on success or for a skipped key; on failure sets errno to
+ * the positive error reported by posix_handle_pair() and returns -1.
+ */
+static int
+_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v,
+                                    void *tmp)
+{
+        posix_xattr_filler_t *filler = tmp;
+        int                   ret = -1;
+
+        /* keys this callback never writes */
+        if (!strcmp (GFID_XATTR_KEY, k))
+                return 0;
+        if (!strcmp ("gfid-req", k))
+                return 0;
+        if (!strcmp (POSIX_ACL_DEFAULT_XATTR, k))
+                return 0;
+        if (!strcmp (POSIX_ACL_ACCESS_XATTR, k))
+                return 0;
+        if (ZR_FILE_CONTENT_REQUEST(k))
+                return 0;
+
+        ret = posix_handle_pair (filler->this, filler->real_path, k, v,
+                                 XATTR_CREATE);
+        if (ret >= 0)
+                return 0;
+
+        errno = -ret;
+        return -1;
+}
+
int
posix_entry_create_xattr_set (xlator_t *this, const char *path,
dict_t *dict)
{
int ret = -1;
+ posix_xattr_filler_t filler = {0,};
+
if (!dict)
goto out;
- int _handle_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (!strcmp (GFID_XATTR_KEY, k) ||
- !strcmp ("gfid-req", k) ||
- !strcmp ("system.posix_acl_default", k) ||
- !strcmp ("system.posix_acl_access", k) ||
- ZR_FILE_CONTENT_REQUEST(k)) {
- return 0;
- }
+ filler.this = this;
+ filler.real_path = path;
- ret = posix_handle_pair (this, path, k, v, XATTR_CREATE);
- if (ret < 0) {
- errno = -ret;
- return -1;
- }
- return 0;
- }
-
- ret = dict_foreach (dict, _handle_keyvalue_pair, NULL);
+ ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler);
out:
return ret;
@@ -1064,3 +1133,259 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd)
return ret;
}
+
+/*
+ * Thread body of the periodic health check.
+ *
+ * @data is the posix xlator_t. The check interval is read once from
+ * priv->health_check_interval; a value of 0 makes the thread exit
+ * immediately. Otherwise the thread loops: sleep for the interval, then
+ * stat() priv->base_path. When stat() fails the parent xlator is sent
+ * GF_EVENT_CHILD_DOWN and the process signals itself with SIGTERM and
+ * then SIGKILL, each after an uninterrupted 30 second sleep.
+ *
+ * Always returns NULL.
+ */
+static void *
+posix_health_check_thread_proc (void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ uint32_t interval = 0;
+ int ret = -1;
+ struct stat sb = {0, };
+
+ this = data;
+ priv = this->private;
+
+ /* prevent races when the interval is updated */
+ interval = priv->health_check_interval;
+ if (interval == 0)
+ goto out;
+
+ /* NOTE(review): interval is uint32_t but is printed with %d */
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, "
+ "interval = %d seconds", interval);
+
+ while (1) {
+ /* aborting sleep() is a request to exit this thread, sleep()
+ * will normally not return when cancelled */
+ ret = sleep (interval);
+ if (ret > 0)
+ break;
+
+ /* prevent thread errors while doing the health-check(s) */
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+ /* Do the health-check, it should be moved to its own function
+ * in case it gets more complex. */
+ ret = stat (priv->base_path, &sb);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "stat() on %s returned: %s", priv->base_path,
+ strerror (errno));
+ /* cancellation is still disabled when jumping to abort */
+ goto abort;
+ }
+
+ pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting");
+
+ /* regular exit: mark the checker as no longer running */
+ LOCK (&priv->lock);
+ {
+ priv->health_check_active = _gf_false;
+ }
+ UNLOCK (&priv->lock);
+
+ return NULL;
+
+abort:
+ /* health-check failed */
+ gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down");
+ xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this);
+
+ /* escalate only if the full 30 seconds elapsed (sleep returned 0) */
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM");
+ kill (getpid(), SIGTERM);
+ }
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL");
+ kill (getpid(), SIGKILL);
+ }
+
+ /* this path leaves priv->health_check_active untouched */
+ return NULL;
+}
+
+/*
+ * (Re)start the health-check thread of the posix xlator @xl.
+ *
+ * Under priv->lock: a checker that is marked active is cancelled, and,
+ * unless priv->health_check_interval is 0, a new detached thread running
+ * posix_health_check_thread_proc() is created. On creation failure the
+ * interval is reset to 0, the checker is marked inactive and the error
+ * is logged.
+ */
+void
+posix_spawn_health_check_thread (xlator_t *xl)
+{
+        struct posix_private *priv = NULL;
+        int                   ret = -1;
+
+        priv = xl->private;
+
+        LOCK (&priv->lock);
+        {
+                /* cancel the running thread */
+                if (priv->health_check_active == _gf_true) {
+                        pthread_cancel (priv->health_check);
+                        priv->health_check_active = _gf_false;
+                }
+
+                /* prevent scheduling a check in a tight loop */
+                if (priv->health_check_interval == 0)
+                        goto unlock;
+
+                ret = gf_thread_create (&priv->health_check, NULL,
+                                        posix_health_check_thread_proc, xl);
+                /* Any non-zero result is a failure (same test as the
+                 * gf_thread_create() call in posix_aio_init). A
+                 * pthread-style positive return is the error number
+                 * itself; errno is only consulted as a fallback. */
+                if (ret != 0) {
+                        priv->health_check_interval = 0;
+                        priv->health_check_active = _gf_false;
+                        gf_log (xl->name, GF_LOG_ERROR,
+                                "unable to setup health-check thread: %s",
+                                strerror ((ret > 0) ? ret : errno));
+                        goto unlock;
+                }
+
+                /* run the thread detached, resources will be freed on exit */
+                pthread_detach (priv->health_check);
+                priv->health_check_active = _gf_true;
+        }
+unlock:
+        UNLOCK (&priv->lock);
+}
+
+/*
+ * Block until at least one fsync request is queued on priv->fsyncs,
+ * then move the whole queue onto @head.
+ *
+ * Returns the value priv->fsync_queue_count had when the queue was
+ * taken; the counter is reset to 0. All of this happens while holding
+ * priv->fsync_mutex.
+ */
+int
+posix_fsyncer_pick (xlator_t *this, struct list_head *head)
+{
+        struct posix_private *priv = this->private;
+        int                   picked = 0;
+
+        pthread_mutex_lock (&priv->fsync_mutex);
+
+        /* wait on the condition variable until something is queued */
+        for (;;) {
+                if (!list_empty (&priv->fsyncs))
+                        break;
+                pthread_cond_wait (&priv->fsync_cond, &priv->fsync_mutex);
+        }
+
+        list_splice_init (&priv->fsyncs, head);
+        picked = priv->fsync_queue_count;
+        priv->fsync_queue_count = 0;
+
+        pthread_mutex_unlock (&priv->fsync_mutex);
+
+        return picked;
+}
+
+
+/*
+ * Complete one queued fsync request.
+ *
+ * @stub is the queued call stub; its args.fd identifies the file and
+ * args.datasync selects fdatasync() (when HAVE_FDATASYNC is defined)
+ * over fsync(). When @do_fsync is false no sync call is made and the
+ * request is unwound as successful.
+ *
+ * The stub is always unwound through call_unwind_error(): with
+ * (-1, EINVAL) when no fd context exists, with (-1, errno of the sync
+ * call) when syncing fails, and with (0, 0) otherwise.
+ */
+void
+posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync)
+{
+        struct posix_fd *pfd = NULL;
+        int              ret = -1;
+        int              op_errno = 0;
+
+        ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not get fdctx for fd(%s)",
+                        uuid_utoa (stub->args.fd->inode->gfid));
+                call_unwind_error (stub, -1, EINVAL);
+                return;
+        }
+
+        if (do_fsync) {
+#ifdef HAVE_FDATASYNC
+                if (stub->args.datasync)
+                        ret = fdatasync (pfd->fd);
+                else
+#endif
+                        ret = fsync (pfd->fd);
+        } else {
+                ret = 0;
+        }
+
+        if (ret) {
+                /* capture errno before logging can overwrite it */
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not fsync fd(%s): %s",
+                        uuid_utoa (stub->args.fd->inode->gfid),
+                        strerror (op_errno));
+                call_unwind_error (stub, -1, op_errno);
+                return;
+        }
+
+        call_unwind_error (stub, 0, 0);
+}
+
+
+/*
+ * Flush a whole filesystem once on behalf of a batch of fsync requests.
+ *
+ * The fd is taken from the last stub on @head (head->prev). If no fd
+ * context can be obtained for it the function returns without syncing.
+ * On GF_LINUX_HOST_OS with SYS_syncfs defined the syncfs system call is
+ * issued on that fd; in every other configuration sync() is called.
+ *
+ * NOTE(review): assumes @head is non-empty - TODO confirm with callers.
+ */
+static void
+posix_fsyncer_syncfs (xlator_t *this, struct list_head *head)
+{
+ call_stub_t *stub = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+
+ stub = list_entry (head->prev, call_stub_t, list);
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret)
+ return;
+
+#ifdef GF_LINUX_HOST_OS
+ /* syncfs() is not "declared" in RHEL's glibc even though
+ the kernel has support.
+ */
+ /* NOTE(review): these headers are included inside the function body */
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifdef SYS_syncfs
+ syscall (SYS_syncfs, pfd->fd);
+#else
+ sync();
+#endif
+#else
+ sync();
+#endif
+}
+
+
+/*
+ * Endless worker loop that services queued fsync requests in batches.
+ *
+ * Each round: wait for a batch via posix_fsyncer_pick(), sleep for
+ * priv->batch_fsync_delay_usec, run one filesystem-wide sync for the
+ * BATCH_SYNCFS* modes, then complete the stubs from the tail of the
+ * batch to its head. In BATCH_SYNCFS mode no per-stub fsync is issued;
+ * in BATCH_SYNCFS_SINGLE_FSYNC mode only the first processed stub is
+ * fsync'ed. @d is the posix xlator_t. The function never returns.
+ */
+void *
+posix_fsyncer (void *d)
+{
+        xlator_t             *this = d;
+        struct posix_private *priv = this->private;
+        call_stub_t          *cur = NULL;
+        call_stub_t          *nxt = NULL;
+        struct list_head      batch;
+        int                   picked = 0;
+        gf_boolean_t          do_fsync = _gf_true;
+
+        while (1) {
+                INIT_LIST_HEAD (&batch);
+
+                picked = posix_fsyncer_pick (this, &batch);
+
+                usleep (priv->batch_fsync_delay_usec);
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "picked %d fsyncs", picked);
+
+                switch (priv->batch_fsync_mode) {
+                case BATCH_SYNCFS:
+                case BATCH_SYNCFS_SINGLE_FSYNC:
+                case BATCH_SYNCFS_REVERSE_FSYNC:
+                        posix_fsyncer_syncfs (this, &batch);
+                        break;
+                case BATCH_NONE:
+                case BATCH_REVERSE_FSYNC:
+                        break;
+                }
+
+                do_fsync = (priv->batch_fsync_mode == BATCH_SYNCFS)
+                                ? _gf_false : _gf_true;
+
+                list_for_each_entry_safe_reverse (cur, nxt, &batch, list) {
+                        list_del_init (&cur->list);
+
+                        posix_fsyncer_process (this, cur, do_fsync);
+
+                        /* single-fsync mode: only the first stub syncs */
+                        if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC)
+                                do_fsync = _gf_false;
+                }
+        }
+}
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index cf4e08663..fb45c7a67 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -23,6 +23,8 @@
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
+#include <signal.h>
+#include <sys/uio.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -50,6 +52,7 @@
#include "glusterfs3-xdr.h"
#include "hashfn.h"
#include "posix-aio.h"
+#include "glusterfs-acl.h"
extern char *marker_xattrs[];
#define ALIGN_SIZE 4096
@@ -128,7 +131,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf);
if (uuid_is_null (loc->inode->gfid)) {
- posix_gfid_set (this, real_path, loc, xdata);
+ posix_gfid_heal (this, real_path, loc, xdata);
MAKE_ENTRY_HANDLE (real_path, par_path, this,
loc, &buf);
}
@@ -561,6 +564,289 @@ out:
return 0;
}
+/*
+ * Common worker for fallocate-based FOPs (used by _posix_fallocate and
+ * posix_discard).
+ *
+ * Runs with the fs uid/gid taken from frame->root, stats the backing fd
+ * into @statpre, calls sys_fallocate() with @flags, @offset and @len,
+ * and stats again into @statpost.
+ *
+ * Returns the result of the final posix_fdstat() on success. On failure
+ * returns -errno of the failing fstat/fallocate call, or the negative
+ * value reported by posix_fd_ctx_get(); it stays -1 when one of the
+ * VALIDATE_OR_GOTO checks jumps to out.
+ */
+static int32_t
+posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ off_t offset, size_t len, struct iatt *statpre,
+ struct iatt *statpost)
+{
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ /* NOTE(review): frame is dereferenced here, before the
+ * VALIDATE_OR_GOTO (frame, out) check below - confirm intent */
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ /* snapshot of the attributes before the operation */
+ ret = posix_fdstat (this, pfd->fd, statpre);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+
+ ret = sys_fallocate(pfd->fd, flags, offset, len);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ /* snapshot of the attributes after the operation */
+ ret = posix_fdstat (this, pfd->fd, statpost);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+
+out:
+ SET_TO_OLD_FS_ID ();
+
+ return ret;
+}
+
+/*
+ * Allocate a zeroed buffer of @size + ALIGN_SIZE bytes and store in
+ * *@aligned_buf the ALIGN_SIZE-aligned address inside it (as computed
+ * by GF_ALIGN_BUF).
+ *
+ * Returns the raw allocation, which is the pointer that must later be
+ * freed, or NULL on allocation failure, in which case *@aligned_buf is
+ * left untouched.
+ */
+char*
+_page_aligned_alloc (size_t size, char **aligned_buf)
+{
+        char *raw = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
+
+        if (raw != NULL) {
+                /* page aligned buffer */
+                *aligned_buf = GF_ALIGN_BUF (raw, ALIGN_SIZE);
+        }
+
+        return raw;
+}
+
+/*
+ * Write @len zero bytes to @fd starting at @offset.
+ *
+ * One zeroed buffer of at most VECTOR_SIZE bytes is shared by every
+ * iovec entry; at most MAX_NO_VECT entries are submitted per writev()
+ * call. @o_direct selects a page-aligned buffer obtained from
+ * _page_aligned_alloc().
+ *
+ * Returns 0 when @len is 0, otherwise the result of the last writev()
+ * on success, and -1 on failure. After a failing lseek()/writev() errno
+ * still holds that call's error when this function returns.
+ *
+ * NOTE(review): short writes from writev() are not retried.
+ */
+static int32_t
+_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct)
+{
+        size_t        num_vect    = 0;
+        size_t        vidx        = 0;
+        int32_t       num_loop    = 1;
+        int32_t       idx         = 0;
+        int32_t       op_ret      = -1;
+        int32_t       vect_size   = VECTOR_SIZE;
+        int           saved_errno = 0;
+        size_t        remain      = 0;
+        size_t        extra       = 0;
+        struct iovec *vector      = NULL;
+        char         *iov_base    = NULL;
+        char         *alloc_buf   = NULL;
+
+        if (len == 0)
+                return 0;
+        if (len < VECTOR_SIZE)
+                vect_size = len;
+
+        /* full buffers, trailing partial buffer, and batching of the
+         * full buffers into writev() calls of MAX_NO_VECT entries */
+        num_vect = len / (vect_size);
+        remain = len % vect_size ;
+        if (num_vect > MAX_NO_VECT) {
+                extra = num_vect % MAX_NO_VECT;
+                num_loop = num_vect / MAX_NO_VECT;
+                num_vect = MAX_NO_VECT;
+        }
+
+        vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+                            gf_common_mt_iovec);
+        if (!vector)
+                return -1;
+        if (o_direct) {
+                alloc_buf = _page_aligned_alloc(vect_size, &iov_base);
+                if (!alloc_buf) {
+                        gf_log ("_posix_do_zerofill", GF_LOG_DEBUG,
+                                "memory alloc failed, vect_size %d: %s",
+                                vect_size, strerror(errno));
+                        GF_FREE(vector);
+                        return -1;
+                }
+        } else {
+                iov_base = GF_CALLOC (vect_size, sizeof(char),
+                                      gf_common_mt_char);
+                if (!iov_base) {
+                        GF_FREE(vector);
+                        return -1;
+                }
+        }
+
+        /* every entry points at the same zeroed buffer */
+        for (vidx = 0; vidx < num_vect; vidx++) {
+                vector[vidx].iov_base = iov_base;
+                vector[vidx].iov_len  = vect_size;
+        }
+
+        /* do not write anything if positioning failed, otherwise the
+         * zeroes would land at the fd's previous offset */
+        if (lseek (fd, offset, SEEK_SET) == (off_t) -1) {
+                op_ret = -1;
+                goto err;
+        }
+
+        for (idx = 0; idx < num_loop; idx++) {
+                op_ret = writev(fd, vector, num_vect);
+                if (op_ret < 0)
+                        goto err;
+        }
+        if (extra) {
+                op_ret = writev(fd, vector, extra);
+                if (op_ret < 0)
+                        goto err;
+        }
+        if (remain) {
+                vector[0].iov_len = remain;
+                op_ret = writev(fd, vector , 1);
+                if (op_ret < 0)
+                        goto err;
+        }
+err:
+        /* the caller reads errno after a failure; keep the frees from
+         * disturbing it */
+        saved_errno = errno;
+        if (o_direct)
+                GF_FREE(alloc_buf);
+        else
+                GF_FREE(iov_base);
+        GF_FREE(vector);
+        errno = saved_errno;
+        return op_ret;
+}
+
+/*
+ * Worker behind posix_zerofill(): zero @len bytes of @fd at @offset.
+ *
+ * Runs with the fs uid/gid taken from frame->root, stats the backing fd
+ * into @statpre, writes the zeroes through _posix_do_zerofill(), calls
+ * fsync() when the fd was opened with O_SYNC or O_DSYNC, and stats
+ * again into @statpost.
+ *
+ * Returns the result of the final posix_fdstat() on success. On failure
+ * returns -errno of the failing step, or the negative value reported by
+ * posix_fd_ctx_get(); it stays -1 when one of the VALIDATE_OR_GOTO
+ * checks jumps to out.
+ */
+static int32_t
+posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  off_t offset, size_t len, struct iatt *statpre,
+                  struct iatt *statpost)
+{
+        struct posix_fd *pfd = NULL;
+        int32_t          ret = -1;
+
+        DECLARE_OLD_FS_ID_VAR;
+
+        /* NOTE(review): frame is dereferenced here, before the
+         * VALIDATE_OR_GOTO (frame, out) check below - confirm intent */
+        SET_FS_ID (frame->root->uid, frame->root->gid);
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = posix_fd_ctx_get (fd, this, &pfd);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "pfd is NULL from fd=%p", fd);
+                goto out;
+        }
+
+        ret = posix_fdstat (this, pfd->fd, statpre);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "pre-operation fstat failed on fd = %p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+        ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT);
+        if (ret < 0) {
+                ret = -errno;
+                gf_log(this->name, GF_LOG_ERROR,
+                       "zerofill failed on fd %d length %zu %s",
+                       pfd->fd, len, strerror(errno));
+                goto out;
+        }
+        if (pfd->flags & (O_SYNC|O_DSYNC)) {
+                ret = fsync (pfd->fd);
+                if (ret) {
+                        /* record errno before logging can overwrite it */
+                        ret = -errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "fsync() in zerofill on fd %d failed: %s",
+                                pfd->fd, strerror (-ret));
+                        goto out;
+                }
+        }
+
+        ret = posix_fdstat (this, pfd->fd, statpost);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "post operation fstat failed on fd=%p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+
+out:
+        SET_TO_OLD_FS_ID ();
+
+        return ret;
+}
+
+/*
+ * fallocate FOP worker: preallocate @len bytes at @offset on @fd.
+ * A non-zero @keep_size requests FALLOC_FL_KEEP_SIZE.
+ *
+ * The frame is unwound with (0, 0) and the pre/post attributes on
+ * success, or with (-1, positive error) and NULL attributes on failure.
+ * Always returns 0.
+ */
+static int32_t
+_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+        int32_t     flags = keep_size ? FALLOC_FL_KEEP_SIZE : 0;
+        int32_t     ret = 0;
+
+        ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+                                 &statpre, &statpost);
+        if (ret < 0) {
+                /* ret carries a negated errno value */
+                STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL);
+        } else {
+                STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * discard FOP: punch a hole of @len bytes at @offset in @fd by calling
+ * posix_do_fallocate() with FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE.
+ *
+ * The frame is unwound with (0, 0) and the pre/post attributes on
+ * success, or with (-1, positive error) and NULL attributes on failure.
+ * Always returns 0.
+ */
+static int32_t
+posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              size_t len, dict_t *xdata)
+{
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+        int32_t     flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE;
+        int32_t     ret = 0;
+
+        ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+                                 &statpre, &statpost);
+        if (ret < 0) {
+                /* ret carries a negated errno value */
+                STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL);
+        } else {
+                STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * zerofill FOP: overwrite @len bytes at @offset of @fd with zeroes via
+ * posix_do_zerofill().
+ *
+ * The frame is unwound with (0, 0) and the pre/post attributes on
+ * success, or with (-1, positive error) and NULL attributes on failure.
+ * Always returns 0.
+ */
+static int32_t
+posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+        int32_t     ret = 0;
+
+        ret = posix_do_zerofill(frame, this, fd, offset, len,
+                                &statpre, &statpost);
+        if (ret < 0) {
+                /* ret carries a negated errno value */
+                STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL);
+        } else {
+                STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL);
+        }
+
+        return 0;
+}
+
int32_t
posix_opendir (call_frame_t *frame, xlator_t *this,
loc_t *loc, fd_t *fd, dict_t *xdata)
@@ -1718,6 +2004,9 @@ posix_create (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (was_present)
+ goto fill_stat;
+
op_ret = posix_gfid_set (this, real_path, loc, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1748,6 +2037,7 @@ posix_create (call_frame_t *frame, xlator_t *this,
strerror (errno));
}
+fill_stat:
op_ret = posix_fdstat (this, _fd, &stbuf);
if (op_ret == -1) {
op_errno = errno;
@@ -1965,11 +2255,7 @@ posix_readv (call_frame_t *frame, xlator_t *this,
}
/* Hack to notify higher layers of EOF. */
- if (stbuf.ia_size == 0)
- op_errno = ENOENT;
- else if ((offset + vec.iov_len) == stbuf.ia_size)
- op_errno = ENOENT;
- else if (offset > stbuf.ia_size)
+ if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size)
op_errno = ENOENT;
op_ret = vec.iov_len;
@@ -2014,22 +2300,6 @@ err:
return op_ret;
}
-char*
-_page_aligned_alloc (size_t size, char **aligned_buf)
-{
- char *alloc_buf = NULL;
- char *buf = NULL;
-
- alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
- if (!alloc_buf)
- goto out;
- /* page aligned buffer */
- buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
- *aligned_buf = buf;
-out:
- return alloc_buf;
-}
-
int32_t
__posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
int odirect)
@@ -2078,6 +2348,48 @@ err:
return op_ret;
}
+/*
+ * Build the response dictionary for a writev.
+ *
+ * A dictionary is only produced when the request xdata carries
+ * GLUSTERFS_OPEN_FD_COUNT. It is then filled with the inode's current
+ * fd_count under that key and with is_append under
+ * GLUSTERFS_WRITE_IS_APPEND. A failure to set either key is logged as a
+ * warning and otherwise ignored.
+ *
+ * Returns a new dict (the caller drops the reference), or NULL when the
+ * arguments are invalid, the request did not ask for the data, or
+ * allocation failed.
+ */
+dict_t*
+_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
+{
+        inode_t *ino   = fd ? fd->inode : NULL;
+        dict_t  *reply = NULL;
+        int32_t  rc    = 0;
+
+        if (!fd || !ino || uuid_is_null (ino->gfid)) {
+                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: "
+                                  "fd: %p inode: %p gfid:%s", fd, ino,
+                                  ino ? uuid_utoa (ino->gfid) : "N/A");
+                return NULL;
+        }
+
+        /* nothing to report unless the caller asked for the fd count */
+        if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT))
+                return NULL;
+
+        reply = dict_new ();
+        if (!reply)
+                return NULL;
+
+        rc = dict_set_uint32 (reply, GLUSTERFS_OPEN_FD_COUNT, ino->fd_count);
+        if (rc < 0)
+                gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+                        "dictionary value for %s", uuid_utoa (ino->gfid),
+                        GLUSTERFS_OPEN_FD_COUNT);
+
+        rc = dict_set_uint32 (reply, GLUSTERFS_WRITE_IS_APPEND, is_append);
+        if (rc < 0)
+                gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+                        "dictionary value for %s", uuid_utoa (ino->gfid),
+                        GLUSTERFS_WRITE_IS_APPEND);
+
+        return reply;
+}
int32_t
posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
@@ -2092,6 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt preop = {0,};
struct iatt postop = {0,};
int ret = -1;
+ dict_t *rsp_xdata = NULL;
+ int is_append = 0;
+ gf_boolean_t locked = _gf_false;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -2113,6 +2428,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
_fd = pfd->fd;
+ if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ /* The write_is_append check and write must happen
+ atomically. Else another write can overtake this
+ write after the check and get written earlier.
+
+ So lock before preop-stat and unlock after write.
+ */
+ locked = _gf_true;
+ LOCK(&fd->inode->lock);
+ }
+
op_ret = posix_fdstat (this, _fd, &preop);
if (op_ret == -1) {
op_errno = errno;
@@ -2122,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto out;
}
+ if (locked) {
+ if (preop.ia_size == offset || (fd->flags & O_APPEND))
+ is_append = 1;
+ }
+
op_ret = __posix_writev (_fd, vector, count, offset,
(pfd->flags & O_DIRECT));
+
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
if (op_ret < 0) {
op_errno = -op_ret;
op_ret = -1;
@@ -2139,14 +2476,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
UNLOCK (&priv->lock);
if (op_ret >= 0) {
+ rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append);
/* wiretv successful, we also need to get the stat of
* the file we wrote to
*/
- if (pfd->flushwrites) {
- /* NOTE: ignore the error, if one occurs at this
- * point */
- fsync (_fd);
+ if (flags & (O_SYNC|O_DSYNC)) {
+ ret = fsync (_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync() in writev on fd %d failed: %s",
+ _fd, strerror (errno));
+ op_ret = -1;
+ op_errno = errno;
+ goto out;
+ }
}
ret = posix_fdstat (this, _fd, &postop);
@@ -2162,9 +2506,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
out:
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop,
- NULL);
+ rsp_xdata);
+ if (rsp_xdata)
+ dict_unref (rsp_xdata);
return 0;
}
@@ -2291,6 +2642,33 @@ out:
}
+/*
+ * Defer an fsync request instead of executing it inline.
+ *
+ * The request is wrapped in a call stub (resuming into default_fsync),
+ * appended to priv->fsyncs under fsync_mutex, and fsync_cond is signalled.
+ * Presumably the posix_fsyncer thread created in init() drains this list;
+ * its body is not visible here. If the stub cannot be allocated the frame
+ * is unwound with ENOMEM. Always returns 0.
+ */
+int
+posix_batch_fsync (call_frame_t *frame, xlator_t *this,
+                   fd_t *fd, int datasync, dict_t *xdata)
+{
+        struct posix_private *conf    = this->private;
+        call_stub_t          *request = NULL;
+
+        request = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata);
+        if (request == NULL) {
+                STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0);
+                return 0;
+        }
+
+        pthread_mutex_lock (&conf->fsync_mutex);
+        list_add_tail (&request->list, &conf->fsyncs);
+        conf->fsync_queue_count++;
+        pthread_cond_signal (&conf->fsync_cond);
+        pthread_mutex_unlock (&conf->fsync_mutex);
+
+        return 0;
+}
+
+
int32_t
posix_fsync (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t datasync, dict_t *xdata)
@@ -2302,6 +2680,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this,
int ret = -1;
struct iatt preop = {0,};
struct iatt postop = {0,};
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -2317,6 +2696,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this,
goto out;
#endif
+ priv = this->private;
+ if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) {
+ posix_batch_fsync (frame, this, fd, datasync, xdata);
+ return 0;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
@@ -2378,6 +2763,17 @@ out:
}
static int gf_posix_xattr_enotsup_log;
+/*
+ * dict_foreach() callback for posix_setxattr(): applies one key/value pair
+ * to the path held in the posix_xattr_filler_t passed as tmp, and returns
+ * whatever posix_handle_pair() returns.
+ */
+static int
+_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+                                void *tmp)
+{
+        posix_xattr_filler_t *ctx = tmp;
+
+        return posix_handle_pair (ctx->this, ctx->real_path, k, v, ctx->flags);
+}
int32_t
posix_setxattr (call_frame_t *frame, xlator_t *this,
@@ -2386,7 +2782,8 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
int32_t op_ret = -1;
int32_t op_errno = 0;
char * real_path = NULL;
- int ret = -1;
+
+ posix_xattr_filler_t filler = {0,};
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2401,17 +2798,13 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
op_ret = -1;
dict_del (dict, GFID_XATTR_KEY);
-
- int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- ret = posix_handle_pair (this, real_path, k, v, flags);
- if (ret < 0) {
- op_errno = -ret;
- }
- return ret;
- }
- op_ret = dict_foreach (dict, _handle_every_keyvalue_pair, NULL);
+ filler.real_path = real_path;
+ filler.this = this;
+ filler.flags = flags;
+ op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair,
+ &filler);
+ if (op_ret < 0)
+ op_errno = -op_ret;
out:
SET_TO_OLD_FS_ID ();
@@ -2421,6 +2814,53 @@ out:
return 0;
}
+
+/**
+ * posix_xattr_get_real_filename - look up the on-disk spelling of a name.
+ *
+ * The part of @key following GF_XATTR_GET_REAL_FILENAME_KEY is compared
+ * case-insensitively (strcasecmp) against the entries of the directory
+ * resolved from @loc. The first matching entry name is stored in @dict
+ * under @key.
+ *
+ * Returns the length of the stored name including its terminating NUL on
+ * success, or a negative errno value: -ESTALE if the handle for @loc could
+ * not be built, -errno from opendir(), -ENOENT when no entry matches, and
+ * -ENOMEM on allocation or dict failure.
+ */
+int
+posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                               const char *key, dict_t *dict, dict_t *xdata)
+{
+        char          *real_path = NULL;
+        struct dirent *dirent    = NULL;
+        DIR           *fd        = NULL;
+        const char    *fname     = NULL;
+        char          *found     = NULL;
+        int            ret       = -1;
+
+        MAKE_INODE_HANDLE (real_path, this, loc, NULL);
+        /* the macro can leave real_path unset; never hand NULL to opendir() */
+        if (!real_path)
+                return -ESTALE;
+
+        fd = opendir (real_path);
+        if (!fd)
+                return -errno;
+
+        /* the requested name follows the fixed key prefix */
+        fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY);
+
+        while ((dirent = readdir (fd))) {
+                if (strcasecmp (dirent->d_name, fname) == 0) {
+                        found = gf_strdup (dirent->d_name);
+                        if (!found) {
+                                closedir (fd);
+                                return -ENOMEM;
+                        }
+                        break;
+                }
+        }
+
+        closedir (fd);
+
+        if (!found)
+                return -ENOENT;
+
+        /* measure before the dict takes ownership of the string */
+        ret = strlen (found) + 1;
+
+        if (dict_set_dynstr (dict, (char *)key, found)) {
+                /* NOTE(review): assumes dict_set_dynstr() leaves 'found'
+                 * to the caller on failure - confirm against dict.c */
+                GF_FREE (found);
+                return -ENOMEM;
+        }
+
+        return ret;
+}
+
/**
* posix_getxattr - this function returns a dictionary with all the
* key:value pair present as xattr. used for
@@ -2475,9 +2915,29 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
dict = dict_new ();
if (!dict) {
+ op_errno = ENOMEM;
goto out;
}
+ if (loc->inode && name &&
+ (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) {
+ ret = posix_xattr_get_real_filename (frame, this, loc,
+ name, dict, xdata);
+ if (ret < 0) {
+ op_ret = -1;
+ op_errno = -ret;
+ gf_log (this->name, (op_errno == ENOENT) ?
+ GF_LOG_DEBUG : GF_LOG_WARNING,
+ "Failed to get real filename (%s, %s): %s",
+ loc->path, name, strerror (op_errno));
+ goto out;
+ }
+
+ size = ret;
+ goto done;
+ }
+
if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) {
if (!list_empty (&loc->inode->fd_list)) {
ret = dict_set_uint32 (dict, (char *)name, 1);
@@ -2501,8 +2961,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
else
rpath = real_path;
- (void) snprintf (host_buf, 1024, "<POSIX(%s):%s:%s>",
- priv->base_path, priv->hostname, rpath);
+ (void) snprintf (host_buf, 1024,
+ "<POSIX(%s):%s:%s>", priv->base_path,
+ ((priv->node_uuid_pathinfo
+ && !uuid_is_null(priv->glusterd_uuid))
+ ? uuid_utoa (priv->glusterd_uuid)
+ : priv->hostname),
+ rpath);
dyn_rpath = gf_strdup (host_buf);
if (!dyn_rpath) {
@@ -2578,6 +3043,11 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
"supported (try remounting"
" brick with 'user_xattr' "
"flag)");
+ } else if (op_errno == ENOATTR ||
+ op_errno == ENODATA) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No such attribute:%s for file %s",
+ key, real_path);
} else {
gf_log (this->name, GF_LOG_ERROR,
"getxattr failed on %s: %s (%s)",
@@ -2895,6 +3365,17 @@ out:
return 0;
}
+/*
+ * dict_foreach() callback for posix_fsetxattr(): applies one key/value pair
+ * to the descriptor held in the posix_xattr_filler_t passed as tmp, and
+ * returns whatever posix_fhandle_pair() returns.
+ */
+static int
+_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+                                 void *tmp)
+{
+        posix_xattr_filler_t *ctx = tmp;
+
+        return posix_fhandle_pair (ctx->this, ctx->fd, k, v, ctx->flags);
+}
int32_t
posix_fsetxattr (call_frame_t *frame, xlator_t *this,
@@ -2904,7 +3385,9 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this,
int32_t op_errno = 0;
struct posix_fd * pfd = NULL;
int _fd = -1;
- int ret = -1;
+ int ret = -1;
+
+ posix_xattr_filler_t filler = {0,};
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2925,17 +3408,13 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this,
dict_del (dict, GFID_XATTR_KEY);
- int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- ret = posix_fhandle_pair (this, _fd, k, v, flags);
- if (ret < 0) {
- op_errno = -ret;
- }
- return ret;
- }
-
- op_ret = dict_foreach (dict, _handle_every_keyvalue_pair, NULL);
+ filler.fd = _fd;
+ filler.this = this;
+ filler.flags = flags;
+ op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair,
+ &filler);
+ if (op_ret < 0)
+ op_errno = -op_ret;
out:
SET_TO_OLD_FS_ID ();
@@ -2945,6 +3424,28 @@ out:
return 0;
}
+/*
+ * dict_foreach() callback used for bulk xattr removal.
+ *
+ * Removes the xattr named key from filler->real_path (data is a
+ * posix_xattr_filler_t). On failure errno is recorded in filler->op_errno
+ * and, unless it is ENOATTR or EPERM, an error is logged.
+ * Returns the result of sys_lremovexattr().
+ */
+int
+_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data)
+{
+        posix_xattr_filler_t *ctx = (posix_xattr_filler_t *) data;
+        int32_t               rc  = 0;
+
+        rc = sys_lremovexattr (ctx->real_path, key);
+        if (rc != -1)
+                return rc;
+
+        ctx->op_errno = errno;
+
+        /* a missing attribute or a permission refusal is not worth logging */
+        if ((errno == ENOATTR) || (errno == EPERM))
+                return rc;
+
+        gf_log (ctx->this->name, GF_LOG_ERROR,
+                "removexattr failed on %s (for %s): %s",
+                ctx->real_path, key, strerror (errno));
+
+        return rc;
+}
+
int32_t
posix_removexattr (call_frame_t *frame, xlator_t *this,
@@ -2953,6 +3454,7 @@ posix_removexattr (call_frame_t *frame, xlator_t *this,
int32_t op_ret = -1;
int32_t op_errno = 0;
char * real_path = NULL;
+ posix_xattr_filler_t filler = {0,};
DECLARE_OLD_FS_ID_VAR;
@@ -2968,6 +3470,22 @@ posix_removexattr (call_frame_t *frame, xlator_t *this,
SET_FS_ID (frame->root->uid, frame->root->gid);
+ /**
+ * sending an empty key name with xdata containing the
+ * list of key(s) to be removed implies "bulk remove request"
+ * for removexattr.
+ */
+ if (name && (strcmp (name, "") == 0) && xdata) {
+ filler.real_path = real_path;
+ filler.this = this;
+ op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler);
+ if (op_ret) {
+ op_errno = filler.op_errno;
+ }
+
+ goto out;
+ }
+
op_ret = sys_lremovexattr (real_path, name);
if (op_ret == -1) {
op_errno = errno;
@@ -3126,6 +3644,159 @@ __add_long_array (int64_t *dest, int64_t *src, int count)
}
}
+/**
+ * _posix_handle_xattr_keyvalue_pair - dict_foreach() callback performing one
+ * xattrop on a single key.
+ *
+ * @d:   dictionary being iterated; on success the value stored under @k is
+ *       replaced by the resulting array.
+ * @k:   xattr name.
+ * @v:   operand array supplied by the caller (v->len bytes).
+ * @tmp: posix_xattr_filler_t describing the target (real_path or fd), the
+ *       operation type (flags) and the inode whose lock serialises the
+ *       read-modify-write.
+ *
+ * Under inode->lock the current xattr value is read (a missing attribute is
+ * treated as all zeroes), combined with @v according to the op type, and
+ * written back.
+ *
+ * Returns 0 on success and -1 on failure. On failure the error code is also
+ * stored in filler->op_errno, and the scratch buffer is released on every
+ * path where the dictionary did not take ownership of it.
+ */
+static int
+_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+                                   void *tmp)
+{
+        int                   size     = 0;
+        int                   count    = 0;
+        int                   op_ret   = 0;
+        int                   op_errno = 0;
+        gf_xattrop_flags_t    optype   = 0;
+        char                 *array    = NULL;
+        inode_t              *inode    = NULL;
+        xlator_t             *this     = NULL;
+        posix_xattr_filler_t *filler   = NULL;
+
+        filler = tmp;
+
+        optype = (gf_xattrop_flags_t)(filler->flags);
+        this   = filler->this;
+        inode  = filler->inode;
+
+        count = v->len;
+        array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char);
+        if (!array) {
+                op_ret   = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        LOCK (&inode->lock);
+        {
+                if (filler->real_path) {
+                        size = sys_lgetxattr (filler->real_path, k,
+                                              (char *)array, v->len);
+                } else {
+                        size = sys_fgetxattr (filler->fd, k, (char *)array,
+                                              v->len);
+                }
+
+                op_errno = errno;
+                /* ENODATA/ENOATTR: attribute absent, operate on zeroes */
+                if ((size == -1) && (op_errno != ENODATA) &&
+                    (op_errno != ENOATTR)) {
+                        if (op_errno == ENOTSUP) {
+                                GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
+                                                    this->name, GF_LOG_WARNING,
+                                                    "Extended attributes not "
+                                                    "supported by filesystem");
+                        } else if (op_errno != ENOENT ||
+                                   !posix_special_xattr (marker_xattrs,
+                                                         k)) {
+                                if (filler->real_path)
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "getxattr failed on %s while doing "
+                                                "xattrop: Key:%s (%s)",
+                                                filler->real_path,
+                                                k, strerror (op_errno));
+                                else
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "fgetxattr failed on fd=%d while doing "
+                                                "xattrop: Key:%s (%s)",
+                                                filler->fd,
+                                                k, strerror (op_errno));
+                        }
+
+                        op_ret = -1;
+                        goto unlock;
+                }
+
+                switch (optype) {
+
+                case GF_XATTROP_ADD_ARRAY:
+                        __add_array ((int32_t *) array, (int32_t *) v->data,
+                                     v->len / 4);
+                        break;
+
+                case GF_XATTROP_ADD_ARRAY64:
+                        __add_long_array ((int64_t *) array, (int64_t *) v->data,
+                                          v->len / 8);
+                        break;
+
+                case GF_XATTROP_OR_ARRAY:
+                        __or_array ((int32_t *) array,
+                                    (int32_t *) v->data,
+                                    v->len / 4);
+                        break;
+
+                case GF_XATTROP_AND_ARRAY:
+                        __and_array ((int32_t *) array,
+                                     (int32_t *) v->data,
+                                     v->len / 4);
+                        break;
+
+                default:
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unknown xattrop type (%d) on %s. Please send "
+                                "a bug report to gluster-devel@nongnu.org",
+                                optype, filler->real_path);
+                        op_ret = -1;
+                        op_errno = EINVAL;
+                        goto unlock;
+                }
+
+                if (filler->real_path) {
+                        size = sys_lsetxattr (filler->real_path, k, array,
+                                              v->len, 0);
+                } else {
+                        size = sys_fsetxattr (filler->fd, k, (char *)array,
+                                              v->len, 0);
+                }
+
+                /* capture errno before anything else can overwrite it */
+                if (size == -1)
+                        op_errno = errno;
+        }
+unlock:
+        UNLOCK (&inode->lock);
+
+        if (op_ret == -1)
+                goto out;
+
+        if (size == -1) {
+                if (filler->real_path)
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "setxattr failed on %s while doing xattrop: "
+                                "key=%s (%s)", filler->real_path,
+                                k, strerror (op_errno));
+                else
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "fsetxattr failed on fd=%d while doing xattrop: "
+                                "key=%s (%s)", filler->fd,
+                                k, strerror (op_errno));
+
+                op_ret = -1;
+                goto out;
+        }
+
+        size = dict_set_bin (d, k, array, v->len);
+        if (size != 0) {
+                if (filler->real_path)
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "dict_set_bin failed (path=%s): "
+                                "key=%s (%s)", filler->real_path,
+                                k, strerror (-size));
+                else
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "dict_set_bin failed (fd=%d): "
+                                "key=%s (%s)", filler->fd,
+                                k, strerror (-size));
+
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        /* the dictionary owns the buffer from here on */
+        array = NULL;
+
+out:
+        if (op_ret < 0)
+                filler->op_errno = op_errno;
+
+        /* releases the scratch buffer on failure paths; no-op after handoff */
+        GF_FREE (array);
+
+        return op_ret;
+}
+
/**
* xattrop - xattr operations - for internal use by GlusterFS
* @optype: ADD_ARRAY:
@@ -3137,32 +3808,24 @@ int
do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
gf_xattrop_flags_t optype, dict_t *xattr)
{
- char *real_path = NULL;
- char *array = NULL;
- int size = 0;
- int count = 0;
-
- int op_ret = 0;
- int op_errno = 0;
-
- int ret = 0;
- int _fd = -1;
- struct posix_fd *pfd = NULL;
-
- char * path = NULL;
- inode_t * inode = NULL;
+ int op_ret = 0;
+ int op_errno = 0;
+ int _fd = -1;
+ char *real_path = NULL;
+ struct posix_fd *pfd = NULL;
+ inode_t *inode = NULL;
+ posix_xattr_filler_t filler = {0,};
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (xattr, out);
VALIDATE_OR_GOTO (this, out);
if (fd) {
- ret = posix_fd_ctx_get (fd, this, &pfd);
- if (ret < 0) {
+ op_ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (op_ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"failed to get pfd from fd=%p",
fd);
- op_ret = -1;
op_errno = EBADFD;
goto out;
}
@@ -3173,152 +3836,21 @@ do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
MAKE_INODE_HANDLE (real_path, this, loc, NULL);
if (real_path) {
- path = gf_strdup (real_path);
inode = loc->inode;
} else if (fd) {
inode = fd->inode;
}
- int _handle_every_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
-
- count = v->len;
- array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char);
-
- LOCK (&inode->lock);
- {
- if (loc) {
- size = sys_lgetxattr (real_path, k,
- (char *)array, v->len);
- } else {
- size = sys_fgetxattr (_fd, k, (char *)array,
- v->len);
- }
-
- op_errno = errno;
- if ((size == -1) && (op_errno != ENODATA) &&
- (op_errno != ENOATTR)) {
- if (op_errno == ENOTSUP) {
- GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
- this->name,GF_LOG_WARNING,
- "Extended attributes not "
- "supported by filesystem");
- } else if (op_errno != ENOENT ||
- !posix_special_xattr (marker_xattrs,
- k)) {
- if (loc)
- gf_log (this->name, GF_LOG_ERROR,
- "getxattr failed on %s while doing "
- "xattrop: Key:%s (%s)", path,
- k, strerror (op_errno));
- else
- gf_log (this->name, GF_LOG_ERROR,
- "fgetxattr failed on fd=%d while doing "
- "xattrop: Key:%s (%s)", _fd,
- k, strerror (op_errno));
- }
-
- op_ret = -1;
- goto unlock;
- }
-
- switch (optype) {
-
- case GF_XATTROP_ADD_ARRAY:
- __add_array ((int32_t *) array, (int32_t *) v->data,
- v->len / 4);
- break;
-
- case GF_XATTROP_ADD_ARRAY64:
- __add_long_array ((int64_t *) array, (int64_t *) v->data,
- v->len / 8);
- break;
-
- case GF_XATTROP_OR_ARRAY:
- __or_array ((int32_t *) array,
- (int32_t *) v->data,
- v->len / 4);
- break;
-
- case GF_XATTROP_AND_ARRAY:
- __and_array ((int32_t *) array,
- (int32_t *) v->data,
- v->len / 4);
- break;
-
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "Unknown xattrop type (%d) on %s. Please send "
- "a bug report to gluster-devel@nongnu.org",
- optype, path);
- op_ret = -1;
- op_errno = EINVAL;
- goto unlock;
- }
-
- if (loc) {
- size = sys_lsetxattr (real_path, k, array,
- v->len, 0);
- } else {
- size = sys_fsetxattr (_fd, k, (char *)array,
- v->len, 0);
- }
- }
- unlock:
- UNLOCK (&inode->lock);
-
- if (op_ret == -1)
- goto out;
-
- op_errno = errno;
- if (size == -1) {
- if (loc)
- gf_log (this->name, GF_LOG_ERROR,
- "setxattr failed on %s while doing xattrop: "
- "key=%s (%s)", path,
- k, strerror (op_errno));
- else
- gf_log (this->name, GF_LOG_ERROR,
- "fsetxattr failed on fd=%d while doing xattrop: "
- "key=%s (%s)", _fd,
- k, strerror (op_errno));
-
- op_ret = -1;
- goto out;
- } else {
- size = dict_set_bin (xattr, k, array, v->len);
-
- if (size != 0) {
- if (loc)
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_bin failed (path=%s): "
- "key=%s (%s)", path,
- k, strerror (-size));
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_bin failed (fd=%d): "
- "key=%s (%s)", _fd,
- k, strerror (-size));
-
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- array = NULL;
- }
+ filler.this = this;
+ filler.fd = _fd;
+ filler.real_path = real_path;
+ filler.flags = (int)optype;
+ filler.inode = inode;
- array = NULL;
-
- out:
- return op_ret;
- }
- op_ret = dict_foreach (xattr, _handle_every_keyvalue_pair, NULL);
+ op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair,
+ &filler);
out:
- GF_FREE (array);
-
- GF_FREE (path);
STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL);
return 0;
@@ -3723,6 +4255,8 @@ posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dic
struct iatt stbuf = {0, };
uuid_t gfid;
+ if (list_empty(&entries->list))
+ return 0;
itable = fd->inode->table;
@@ -3811,8 +4345,23 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this,
*/
ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs);
- count = posix_fill_readdir (fd, dir, off, size, &entries, this,
- skip_dirs);
+ LOCK (&fd->lock);
+ {
+ /* posix_fill_readdir performs multiple separate individual
+ readdir() calls to fill up the buffer.
+
+ In case of NFS where the same anonymous FD is shared between
+ different applications, reading a common directory can
+ result in the anonymous fd getting re-used unsafely between
+ the two readdir requests (in two different io-threads).
+
+ It would also help, in the future, to replace the loop
+ around readdir() with a single large getdents() call.
+ */
+ count = posix_fill_readdir (fd, dir, off, size, &entries, this,
+ skip_dirs);
+ }
+ UNLOCK (&fd->lock);
/* pick ENOENT to indicate EOF */
op_errno = errno;
@@ -4015,6 +4564,27 @@ posix_set_owner (xlator_t *this, uid_t uid, gid_t gid)
return ret;
}
+
+/*
+ * Translate the textual "batch-fsync-mode" option value into the
+ * corresponding BATCH_* constant and store it in priv->batch_fsync_mode.
+ *
+ * Returns 0 when str names a known mode; returns -1 and leaves priv
+ * untouched otherwise.
+ */
+static int
+set_batch_fsync_mode (struct posix_private *priv, const char *str)
+{
+        static const struct {
+                const char *name;
+                int         mode;
+        } known_modes[] = {
+                { "none",                 BATCH_NONE                 },
+                { "syncfs",               BATCH_SYNCFS               },
+                { "syncfs-single-fsync",  BATCH_SYNCFS_SINGLE_FSYNC  },
+                { "syncfs-reverse-fsync", BATCH_SYNCFS_REVERSE_FSYNC },
+                { "reverse-fsync",        BATCH_REVERSE_FSYNC        },
+        };
+        size_t idx = 0;
+
+        for (idx = 0; idx < sizeof (known_modes) / sizeof (known_modes[0]);
+             idx++) {
+                if (strcmp (str, known_modes[idx].name) == 0) {
+                        priv->batch_fsync_mode = known_modes[idx].mode;
+                        return 0;
+                }
+        }
+
+        return -1;
+}
+
+
int
reconfigure (xlator_t *this, dict_t *options)
{
@@ -4022,6 +4592,7 @@ reconfigure (xlator_t *this, dict_t *options)
struct posix_private *priv = NULL;
uid_t uid = -1;
gid_t gid = -1;
+ char *batch_fsync_mode_str = NULL;
priv = this->private;
@@ -4029,6 +4600,18 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out);
posix_set_owner (this, uid, gid);
+ GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec,
+ options, uint32, out);
+
+ GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str,
+ options, str, out);
+
+ if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s",
+ batch_fsync_mode_str);
+ goto out;
+ }
+
GF_OPTION_RECONF ("linux-aio", priv->aio_configured,
options, bool, out);
@@ -4037,6 +4620,20 @@ reconfigure (xlator_t *this, dict_t *options)
else
posix_aio_off (this);
+ GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo,
+ options, bool, out);
+
+ if (priv->node_uuid_pathinfo &&
+ (uuid_is_null (priv->glusterd_uuid))) {
+ gf_log (this->name, GF_LOG_INFO,
+ "glusterd uuid is NULL, pathinfo xattr would"
+ " fallback to <hostname>:<export>");
+ }
+
+ GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval,
+ options, uint32, out);
+ posix_spawn_health_check_thread (this);
+
ret = 0;
out:
return ret;
@@ -4066,6 +4663,7 @@ init (xlator_t *this)
char *guuid = NULL;
uid_t uid = -1;
gid_t gid = -1;
+ char *batch_fsync_mode_str;
dir_data = dict_get (this->options, "directory");
@@ -4218,7 +4816,7 @@ init (xlator_t *this)
}
}
- size = sys_lgetxattr (dir_data->data, "system.posix_acl_access",
+ size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR,
NULL, 0);
if ((size < 0) && (errno == ENOTSUP))
gf_log (this->name, GF_LOG_WARNING,
@@ -4398,11 +4996,48 @@ init (xlator_t *this)
}
}
+ GF_OPTION_INIT ("node-uuid-pathinfo",
+ _private->node_uuid_pathinfo, bool, out);
+ if (_private->node_uuid_pathinfo &&
+ (uuid_is_null (_private->glusterd_uuid))) {
+ gf_log (this->name, GF_LOG_INFO,
+ "glusterd uuid is NULL, pathinfo xattr would"
+ " fallback to <hostname>:<export>");
+ }
+
+ _private->health_check_active = _gf_false;
+ GF_OPTION_INIT ("health-check-interval",
+ _private->health_check_interval, uint32, out);
+ if (_private->health_check_interval)
+ posix_spawn_health_check_thread (this);
+
pthread_mutex_init (&_private->janitor_lock, NULL);
pthread_cond_init (&_private->janitor_cond, NULL);
INIT_LIST_HEAD (&_private->janitor_fds);
posix_spawn_janitor_thread (this);
+
+ pthread_mutex_init (&_private->fsync_mutex, NULL);
+ pthread_cond_init (&_private->fsync_cond, NULL);
+ INIT_LIST_HEAD (&_private->fsyncs);
+
+ ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fsyncer thread"
+ " creation failed (%s)", strerror (errno));
+ goto out;
+ }
+
+ GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out);
+
+ if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s",
+ batch_fsync_mode_str);
+ goto out;
+ }
+
+ GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
+ uint32, out);
out:
return ret;
}
@@ -4468,6 +5103,9 @@ struct xlator_fops fops = {
.fxattrop = posix_fxattrop,
.setattr = posix_setattr,
.fsetattr = posix_fsetattr,
+ .fallocate = _posix_fallocate,
+ .discard = posix_discard,
+ .zerofill = posix_zerofill,
};
struct xlator_cbks cbks = {
@@ -4504,12 +5142,51 @@ struct volume_options options[] = {
{
.key = {"brick-uid"},
.type = GF_OPTION_TYPE_INT,
- .description = "Support for setting uid of brick's root"
+ .min = 0,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Support for setting uid of brick's owner"
},
{
.key = {"brick-gid"},
.type = GF_OPTION_TYPE_INT,
- .description = "Support for setting gid of brick's root"
+ .min = 0,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Support for setting gid of brick's owner"
},
+ { .key = {"node-uuid-pathinfo"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "return glusterd's node-uuid in pathinfo xattr"
+ " string instead of hostname"
+ },
+ {
+ .key = {"health-check-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "30",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Interval in seconds for a filesystem health check, "
+ "set to 0 to disable"
+ },
+        /* Strategy used to service queued fsync requests; the accepted
+         * strings are the ones parsed by set_batch_fsync_mode(). */
+        { .key = {"batch-fsync-mode"},
+          .type = GF_OPTION_TYPE_STR,
+          .default_value = "reverse-fsync",
+          .description = "Possible values:\n"
+          "\t- none: Do not batch fsyncs.\n"
+          "\t- syncfs: Perform one syncfs() on behalf of a batch"
+          " of fsyncs.\n"
+          "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch"
+          " of fsyncs and one fsync() per batch.\n"
+          "\t- syncfs-reverse-fsync: Perform one syncfs() on behalf of a batch"
+          " of fsyncs and fsync() each file in the batch in reverse order.\n"
+          "\t- reverse-fsync: Perform fsync() of each file in the batch in"
+          " reverse order."
+        },
+ { .key = {"batch-fsync-delay-usec"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description = "Num of usecs to wait for aggregating fsync"
+ " requests",
+ },
{ .key = {NULL} }
};
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 45ee35963..3121db271 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -43,12 +43,15 @@
#include "timer.h"
#include "posix-mem-types.h"
#include "posix-handle.h"
+#include "call-stub.h"
#ifdef HAVE_LIBAIO
#include <libaio.h>
#include "posix-aio.h"
#endif
+/* Parenthesised so the macro expands safely inside larger expressions
+ * such as x / VECTOR_SIZE or x % VECTOR_SIZE. */
+#define VECTOR_SIZE (64 * 1024) /* vector size 64KB */
+#define MAX_NO_VECT 1024
/**
* posix_fd - internal structure common to file and directory fd's
*/
@@ -57,7 +60,6 @@ struct posix_fd {
int fd; /* fd returned by the kernel */
int32_t flags; /* flags for open/creat */
DIR * dir; /* handle returned by the kernel */
- int flushwrites;
int odirect;
struct list_head list; /* to add to the janitor list */
};
@@ -125,8 +127,45 @@ struct posix_private {
io_context_t ctxp;
pthread_t aiothread;
#endif
+
+ /* node-uuid in pathinfo xattr */
+ gf_boolean_t node_uuid_pathinfo;
+
+ pthread_t fsyncer;
+ struct list_head fsyncs;
+ pthread_mutex_t fsync_mutex;
+ pthread_cond_t fsync_cond;
+ int fsync_queue_count;
+
+ enum {
+ BATCH_NONE = 0,
+ BATCH_SYNCFS,
+ BATCH_SYNCFS_SINGLE_FSYNC,
+ BATCH_REVERSE_FSYNC,
+ BATCH_SYNCFS_REVERSE_FSYNC
+ } batch_fsync_mode;
+
+ uint32_t batch_fsync_delay_usec;
+
+ /* seconds to sleep between health checks */
+ uint32_t health_check_interval;
+ pthread_t health_check;
+ gf_boolean_t health_check_active;
};
+/*
+ * Context handed to the dict_foreach() callbacks that process xattrs one
+ * key at a time (setxattr/fsetxattr pairs, bulk removexattr, xattrop).
+ * Callers zero-initialise it and fill in only the fields their callback reads.
+ */
+typedef struct {
+ /* translator instance; callbacks use this->name for logging */
+ xlator_t *this;
+ /* on-disk path for the path-based (l*xattr) calls; the xattrop callback
+    falls back to 'fd' when this is NULL */
+ const char *real_path;
+ /* NOTE(review): xattr, stbuf and loc are not referenced by the callbacks
+    visible in this change - presumably used by lookup-time xattr filling */
+ dict_t *xattr;
+ struct iatt *stbuf;
+ loc_t *loc;
+ inode_t *inode; /* for all do_xattrop() key handling */
+ /* open descriptor for the fd-based (f*xattr) calls */
+ int fd;
+ /* setxattr flags, or the xattrop operation type cast to int */
+ int flags;
+ /* errno recorded by _posix_remove_xattr() when a removal fails */
+ int32_t op_errno;
+} posix_xattr_filler_t;
+
+
#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path)
#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length)
@@ -151,7 +190,7 @@ int posix_get_file_contents (xlator_t *this, uuid_t pargfid,
int posix_set_file_contents (xlator_t *this, const char *path, char *key,
data_t *value, int flags);
int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req);
-int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req);
+int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req);
int posix_entry_create_xattr_set (xlator_t *this, const char *path,
dict_t *dict);
@@ -163,4 +202,7 @@ gf_boolean_t posix_special_xattr (char **pattern, char *key);
void
__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
off_t offset, size_t size);
+void posix_spawn_health_check_thread (xlator_t *this);
+
+void *posix_fsyncer (void *);
#endif /* _POSIX_H */