summaryrefslogtreecommitdiffstats
path: root/xlators/storage
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage')
-rw-r--r--xlators/storage/Makefile.am8
-rw-r--r--xlators/storage/bd/Makefile.am (renamed from xlators/storage/bdb/Makefile.am)2
-rw-r--r--xlators/storage/bd/src/Makefile.am20
-rw-r--r--xlators/storage/bd/src/bd-aio.c527
-rw-r--r--xlators/storage/bd/src/bd-aio.h41
-rw-r--r--xlators/storage/bd/src/bd-helper.c783
-rw-r--r--xlators/storage/bd/src/bd.c2404
-rw-r--r--xlators/storage/bd/src/bd.h178
-rw-r--r--xlators/storage/bdb/src/Makefile.am18
-rw-r--r--xlators/storage/bdb/src/bctx.c341
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1460
-rw-r--r--xlators/storage/bdb/src/bdb.c3624
-rw-r--r--xlators/storage/bdb/src/bdb.h530
-rw-r--r--xlators/storage/posix/src/Makefile.am18
-rw-r--r--xlators/storage/posix/src/posix-aio.c569
-rw-r--r--xlators/storage/posix/src/posix-aio.h39
-rw-r--r--xlators/storage/posix/src/posix-handle.c744
-rw-r--r--xlators/storage/posix/src/posix-handle.h143
-rw-r--r--xlators/storage/posix/src/posix-helpers.c1391
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h27
-rw-r--r--xlators/storage/posix/src/posix.c5723
-rw-r--r--xlators/storage/posix/src/posix.h158
22 files changed, 10426 insertions, 8322 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am
index 59b968969..c08e8e41b 100644
--- a/xlators/storage/Makefile.am
+++ b/xlators/storage/Makefile.am
@@ -1,3 +1,7 @@
-SUBDIRS = posix $(BDB_SUBDIR)
+SUBDIRS = posix
-CLEANFILES =
+if ENABLE_BD_XLATOR
+SUBDIRS += bd
+endif
+
+CLEANFILES =
diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bd/Makefile.am
index d471a3f92..a985f42a8 100644
--- a/xlators/storage/bdb/Makefile.am
+++ b/xlators/storage/bd/Makefile.am
@@ -1,3 +1,3 @@
SUBDIRS = src
-CLEANFILES =
+CLEANFILES =
diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am
new file mode 100644
index 000000000..3d93f7442
--- /dev/null
+++ b/xlators/storage/bd/src/Makefile.am
@@ -0,0 +1,20 @@
+# Build the BD storage translator only when configure enabled it.
+if ENABLE_BD_XLATOR
+# bd.la is installed as a loadable module in the storage xlator directory.
+xlator_LTLIBRARIES = bd.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
+
+bd_la_LDFLAGS = -module -avoid-version
+# Extra libraries for the translator: lvm2app and librt.
+LIBBD = -llvm2app -lrt
+bd_la_SOURCES = bd.c bd-helper.c bd-aio.c
+bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO)
+
+noinst_HEADERS = bd.h bd-aio.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+endif
diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c
new file mode 100644
index 000000000..62d4590f7
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.c
@@ -0,0 +1,527 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author: M. Mohan Kumar <mohan@in.ibm.com>
+
+ Based on posix-aio.c
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <lvm2app.h>
+#include <sys/uio.h>
+
+#include "xlator.h"
+#include "glusterfs.h"
+#include "defaults.h"
+#include "bd.h"
+#include "bd-aio.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+
+/*
+ * Per-request bookkeeping for one asynchronous read or write. The embedded
+ * iocb's data pointer is set to the enclosing bd_aio_cb at submission time,
+ * which is how the reaper thread gets back to this structure.
+ */
+struct bd_aio_cb {
+ /* control block handed to io_submit() */
+ struct iocb iocb;
+ /* frame to unwind once the request completes */
+ call_frame_t *frame;
+ /* destination buffer of a read (set by bd_aio_readv) */
+ struct iobuf *iobuf;
+ /* reference taken on the caller's iobref by bd_aio_writev */
+ struct iobref *iobref;
+ /* copy of the inode's iatt taken before a write is submitted */
+ struct iatt prebuf;
+ /* GF_FOP_READ or GF_FOP_WRITE; selects the completion handler */
+ int op;
+ /* file offset of the request */
+ off_t offset;
+ /* fd the request was issued on */
+ fd_t *fd;
+};
+
+/*
+ * Switches O_DIRECT on or off for the device fd behind @bd_fd so that it
+ * suits the upcoming request.
+ *
+ * O_DIRECT is wanted when either the fd's open flags or @opflags ask for it,
+ * or when both @offset and @size have their low 12 bits clear. The fd is only
+ * touched when the wanted state differs from the state cached in
+ * bd_fd->odirect; the cached state is updated even if fcntl() fails, in which
+ * case a warning is logged.
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags,
+                     off_t offset, size_t size)
+{
+        int want_direct = 0;
+        int flags       = 0;
+        int rc          = 0;
+
+        if ((fd->flags | opflags) & O_DIRECT) {
+                /* explicitly requested: always honour it */
+                want_direct = 1;
+        } else {
+                /* otherwise use it only for suitably aligned requests */
+                want_direct = ((offset | size) & 0xfff) ? 0 : 1;
+        }
+
+        /* nothing to do unless the cached state and the wanted one differ */
+        if ((want_direct != 0) != (bd_fd->odirect != 0)) {
+                flags = fcntl (bd_fd->fd, F_GETFL);
+                if (want_direct)
+                        rc = fcntl (bd_fd->fd, F_SETFL, (flags | O_DIRECT));
+                else
+                        rc = fcntl (bd_fd->fd, F_SETFL, (flags & (~O_DIRECT)));
+                bd_fd->odirect = want_direct;
+        }
+
+        if (rc) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d",
+                        strerror (errno), bd_fd->fd, flags, bd_fd->odirect);
+        }
+}
+
+/*
+ * Completion handler for an asynchronous read, called from the AIO reaper
+ * thread.
+ *
+ * @paiocb: request state allocated by bd_aio_readv(); released here.
+ * @res:    result reported by the kernel, bytes read or a negative errno.
+ * @res2:   secondary result from the io_event (unused).
+ *
+ * Unwinds the readv frame with the data read, or with an error. Always
+ * returns 0.
+ */
+int
+bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iobuf   *iobuf    = NULL;
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        /* zeroed so the error path never hands garbage to the unwind */
+        struct iovec    iov      = {0,};
+        struct iobref  *iobref   = NULL;
+        off_t           offset   = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame  = paiocb->frame;
+        this   = frame->this;
+        iobuf  = paiocb->iobuf;
+        offset = paiocb->offset;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)",
+                        paiocb->fd, paiocb->iocb.u.c.nbytes,
+                        (unsigned long long) paiocb->offset,
+                        res, strerror (op_errno));
+                goto out;
+        }
+
+        /* the inode context supplies the attributes returned to the caller */
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async): no BD inode context for fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref_add (iobref, iobuf);
+
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len  = op_ret;
+
+        /* Hack to notify higher layers of EOF. */
+        if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+                op_errno = ENOENT;
+
+out:
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
+                             &postbuf, iobref, NULL);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        if (iobref)
+                iobref_unref (iobref);
+
+        /* paiocb was obtained with CALLOC(), so it must not go to GF_FREE() */
+        FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * readv fop used when Linux AIO is enabled.
+ *
+ * If the fd has no BD fd context the call is wound to the first child.
+ * Otherwise a buffer of @size bytes is allocated and a PREAD request is
+ * submitted; the frame is unwound later by bd_aio_readv_complete(). On any
+ * failure before a successful submission the frame is unwound here with -1.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        int                _fd      = -1;
+        struct iobuf      *iobuf    = NULL;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                /* no BD fd context: let the child translator serve it */
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv, fd, size, offset,
+                            flags, xdata);
+                return 0;
+        }
+        _fd = bd_fd->fd;
+
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto err;
+        }
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb = CALLOC (1, sizeof (*paiocb));
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame  = frame;
+        paiocb->iobuf  = iobuf;
+        paiocb->offset = offset;
+        paiocb->op     = GF_FOP_READ;
+        paiocb->fd     = fd;
+
+        /* data points back at paiocb so the reaper thread can find it */
+        paiocb->iocb.data           = paiocb;
+        paiocb->iocb.aio_fildes     = _fd;
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD;
+        paiocb->iocb.aio_reqprio    = 0;
+        paiocb->iocb.u.c.buf        = iobuf_ptr (iobuf);
+        paiocb->iocb.u.c.nbytes     = size;
+        paiocb->iocb.u.c.offset     = offset;
+
+        iocb = &paiocb->iocb;
+
+        /* O_DIRECT state and submission are done under the fd lock */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset, size);
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                op_errno = -ret;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        /* paiocb was obtained with CALLOC(), so it must not go to GF_FREE() */
+        if (paiocb)
+                FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * Completion handler for an asynchronous write, called from the AIO reaper
+ * thread.
+ *
+ * @paiocb: request state allocated by bd_aio_writev(); released here together
+ *          with the iobref reference it holds.
+ * @res:    result reported by the kernel, bytes written or a negative errno.
+ * @res2:   secondary result from the io_event (unused).
+ *
+ * On success the mtime in the inode's cached iatt is refreshed and the frame
+ * is unwound with the pre- and post-operation attributes. Always returns 0.
+ */
+int
+bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iatt     prebuf   = {0,};
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame  = paiocb->frame;
+        prebuf = paiocb->prebuf;
+        this   = frame->this;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async) failed fd=%p,offset=%llu (%d/%s)",
+                        paiocb->fd, (unsigned long long) paiocb->offset, res,
+                        strerror (op_errno));
+
+                goto out;
+        }
+
+        /* without the inode context there are no attributes to report */
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async): no BD inode context for fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+out:
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+                             NULL);
+
+        if (paiocb->iobref)
+                iobref_unref (paiocb->iobref);
+        /* paiocb was obtained with CALLOC(), so it must not go to GF_FREE() */
+        FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * writev fop used when Linux AIO is enabled.
+ *
+ * If the fd has no BD fd context the call is wound to the first child.
+ * Otherwise the inode's current iatt is saved as the pre-op attributes, a
+ * reference is taken on @iobref and a PWRITEV request is submitted; the frame
+ * is unwound later by bd_aio_writev_complete(). On any failure before a
+ * successful submission the frame is unwound here with -1.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *iov, int count, off_t offset, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        int                _fd      = -1;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+        bd_attr_t         *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                /* no BD fd context: let the child translator serve it */
+                STACK_WIND (frame, default_writev_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                            fd, iov, count, offset, flags, iobref, xdata);
+                return 0;
+        }
+
+        /* bdatt->iatt is copied below; refuse to go on without it */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) || !bdatt) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no BD inode context for fd=%p", fd);
+                goto err;
+        }
+
+        _fd = bd_fd->fd;
+
+        paiocb = CALLOC (1, sizeof (*paiocb));
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame  = frame;
+        paiocb->offset = offset;
+        paiocb->op     = GF_FOP_WRITE;
+        paiocb->fd     = fd;
+
+        /* data points back at paiocb so the reaper thread can find it */
+        paiocb->iocb.data           = paiocb;
+        paiocb->iocb.aio_fildes     = _fd;
+        paiocb->iobref              = iobref_ref (iobref);
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV;
+        paiocb->iocb.aio_reqprio    = 0;
+        paiocb->iocb.u.v.vec        = iov;
+        paiocb->iocb.u.v.nr         = count;
+        paiocb->iocb.u.v.offset     = offset;
+
+        iocb = &paiocb->iocb;
+
+        memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+        /* O_DIRECT state and submission are done under the fd lock */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset,
+                                     iov_length (iov, count));
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                op_errno = -ret;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);
+
+        if (paiocb) {
+                if (paiocb->iobref)
+                        iobref_unref (paiocb->iobref);
+                /* allocated with CALLOC(), so it must not go to GF_FREE() */
+                FREE (paiocb);
+        }
+
+        return 0;
+}
+
+/*
+ * Body of the AIO reaper thread. Waits for completed requests on the
+ * translator's AIO context and hands each one to the read or write
+ * completion handler according to the op recorded in its bd_aio_cb.
+ *
+ * @data: the xlator_t this thread serves.
+ *
+ * The loop ends, and NULL is returned, when io_getevents() fails with
+ * anything other than -EINTR.
+ */
+void *
+bd_aio_thread (void *data)
+{
+        xlator_t          *this    = data;
+        bd_priv_t         *priv    = NULL;
+        struct bd_aio_cb  *paiocb  = NULL;
+        struct io_event    reaped[BD_AIO_MAX_NR_GETEVENTS];
+        struct timespec    timeout = {0, };
+        int                nr      = 0;
+        int                idx     = 0;
+
+        THIS = this;
+        priv = this->private;
+
+        timeout.tv_sec = 5;
+        while (1) {
+                memset (reaped, 0, sizeof (reaped));
+                nr = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS,
+                                   reaped, &timeout);
+                if (nr == -EINTR)
+                        continue;
+                if (nr < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "io_getevents() returned %d, exiting", nr);
+                        break;
+                }
+
+                for (idx = 0; idx < nr; idx++) {
+                        /* data was set to the owning bd_aio_cb on submit */
+                        paiocb = reaped[idx].data;
+
+                        if (paiocb->op == GF_FOP_READ) {
+                                bd_aio_readv_complete (paiocb,
+                                                       reaped[idx].res,
+                                                       reaped[idx].res2);
+                        } else if (paiocb->op == GF_FOP_WRITE) {
+                                bd_aio_writev_complete (paiocb,
+                                                        reaped[idx].res,
+                                                        reaped[idx].res2);
+                        } else {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "unknown op %d found in piocb",
+                                        paiocb->op);
+                        }
+                }
+        }
+
+        return NULL;
+}
+
+/*
+ * Creates the AIO context and the reaper thread, then installs the AIO
+ * variants of readv/writev in the translator's fops table.
+ *
+ * Returns 0 on success and also when io_setup() reports ENOSYS; otherwise the
+ * failing call's return value (negative from io_setup(), non-zero from
+ * pthread_create()).
+ *
+ * NOTE(review): on the ENOSYS path no context or thread exists, yet 0 is
+ * returned and bd_aio_on() treats 0 as "AIO capable" - confirm this is
+ * intended.
+ */
+int
+bd_aio_init (xlator_t *this)
+{
+        bd_priv_t *priv = this->private;
+        int        rc   = 0;
+
+        rc = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp);
+        if ((rc == -1 && errno == ENOSYS) || rc == -ENOSYS) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Linux AIO not available at run-time. "
+                        "Continuing with synchronous IO");
+                return 0;
+        }
+
+        if (rc < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "io_setup() failed. ret=%d, errno=%d",
+                        rc, errno);
+                return rc;
+        }
+
+        rc = pthread_create (&priv->aiothread, NULL, bd_aio_thread, this);
+        if (rc != 0) {
+                /* no reaper thread: tear the context down again */
+                io_destroy (priv->ctxp);
+                return rc;
+        }
+
+        this->fops->readv  = bd_aio_readv;
+        this->fops->writev = bd_aio_writev;
+
+        return rc;
+}
+
+
+/*
+ * Enables asynchronous IO for the translator. bd_aio_init() runs on the
+ * first call only and its outcome is remembered in priv->aio_capable; when
+ * capable, the AIO readv/writev handlers are installed.
+ *
+ * Returns the result of bd_aio_init() on the first call, 0 on later calls.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+        bd_priv_t *priv = this->private;
+        int        rc   = 0;
+
+        if (!priv->aio_init_done) {
+                rc = bd_aio_init (this);
+                priv->aio_capable   = (rc == 0) ? _gf_true : _gf_false;
+                priv->aio_init_done = _gf_true;
+        }
+
+        if (priv->aio_capable) {
+                this->fops->readv  = bd_aio_readv;
+                this->fops->writev = bd_aio_writev;
+        }
+
+        return rc;
+}
+
+/*
+ * Disables asynchronous IO by putting the synchronous bd_readv/bd_writev
+ * handlers back into the fops table. Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+        struct xlator_fops *fops = this->fops;
+
+        fops->writev = bd_writev;
+        fops->readv  = bd_readv;
+
+        return 0;
+}
+
+#else
+
+/*
+ * Variant for builds without libaio: only logs that asynchronous IO is
+ * unavailable. Always returns 0.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+        gf_log (this->name, GF_LOG_INFO,
+                "Linux AIO not available at build-time. "
+                "Continuing with synchronous IO");
+
+        return 0;
+}
+
+/*
+ * Variant for builds without libaio: there is nothing to switch off, so it
+ * only logs that asynchronous IO is unavailable. Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+        gf_log (this->name, GF_LOG_INFO,
+                "Linux AIO not available at build-time. "
+                "Continuing with synchronous IO");
+
+        return 0;
+}
+
+/*
+ * Variant for builds without libaio: leaves the fd untouched and only logs
+ * that asynchronous IO is unavailable.
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags,
+                     off_t offset, size_t size)
+{
+        xlator_t *xl = THIS;
+
+        gf_log (xl->name, GF_LOG_INFO,
+                "Linux AIO not available at build-time. "
+                "Continuing with synchronous IO");
+}
+#endif
diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h
new file mode 100644
index 000000000..16f686a4c
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.h
@@ -0,0 +1,41 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _BD_AIO_H
+#define _BD_AIO_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+/*
+ * Maximum number of concurrently submitted IO events. The heaviest load
+ * GlusterFS has been able to handle had 60-80 concurrent calls
+ */
+#define BD_AIO_MAX_NR_EVENTS 256
+
+/* Maximum number of completed IO operations to reap per getevents syscall */
+#define BD_AIO_MAX_NR_GETEVENTS 16
+
+/* Switch the translator's readv/writev to the AIO handlers when available. */
+int bd_aio_on (xlator_t *this);
+/* Switch the translator's readv/writev back to bd_readv/bd_writev. */
+int bd_aio_off (xlator_t *this);
+
+/* readv handler that bd_aio_off() installs. */
+int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+/* writev handler that bd_aio_off() installs. */
+int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
+
+#endif /* !_BD_AIO_H */
diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c
new file mode 100644
index 000000000..5525e346b
--- /dev/null
+++ b/xlators/storage/bd/src/bd-helper.c
@@ -0,0 +1,783 @@
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "run.h"
+
+/*
+ * Stores @ctx as this translator's context on @inode.
+ *
+ * Returns -1 when @inode or @ctx is NULL, otherwise whatever
+ * inode_ctx_set() returns.
+ */
+int
+bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx)
+{
+        uint64_t value = 0;
+        int      rc    = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, out);
+        GF_VALIDATE_OR_GOTO (this->name, ctx, out);
+
+        /* the context slot is a 64-bit integer; keep the pointer in it */
+        value = (long) ctx;
+        rc = inode_ctx_set (inode, this, &value);
+out:
+        return rc;
+}
+
+/*
+ * Fetches this translator's context from @inode.
+ *
+ * Returns -1 when @inode is NULL, otherwise whatever inode_ctx_get()
+ * returns. On a zero return, *@ctx receives the stored pointer if @ctx is
+ * not NULL; on failure *@ctx is left untouched.
+ */
+int
+bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx)
+{
+        uint64_t value = 0;
+        int      rc    = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
+        rc = inode_ctx_get (inode, this, &value);
+        if (rc == 0 && ctx)
+                *ctx = (bd_attr_t *) value;
+out:
+        return rc;
+}
+
+/*
+ * Releases everything held by a bd_local_t and returns it to its memory
+ * pool. A NULL @local is ignored.
+ */
+void
+bd_local_free (xlator_t *this, bd_local_t *local)
+{
+        if (local == NULL)
+                return;
+
+        /* the loc is wiped only when no fd is held */
+        if (local->fd) {
+                fd_unref (local->fd);
+        } else if (local->loc.path) {
+                loc_wipe (&local->loc);
+        }
+
+        if (local->dict)
+                dict_unref (local->dict);
+
+        if (local->inode)
+                inode_unref (local->inode);
+
+        if (local->bdatt) {
+                GF_FREE (local->bdatt->type);
+                GF_FREE (local->bdatt);
+        }
+
+        mem_put (local);
+}
+
+/*
+ * Allocates a local from the translator's local pool and attaches it to
+ * @frame. Returns the new local, or NULL if the allocation failed.
+ */
+bd_local_t *
+bd_local_init (call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local = mem_get0 (this->local_pool);
+
+        frame->local = local;
+
+        return local;
+}
+
+/*
+ * VGs are tagged in GF_XATTR_VOL_ID_KEY:<uuid> format. This function
+ * validates that tag against the volume-id option of the translator and
+ * also walks the LV list to find out whether a thin pool is configured.
+ *
+ * When a thin pool is found its name is stored in priv->pool and
+ * BD_CAPS_THIN is set in priv->caps.
+ *
+ * Returns 0 when the volume-id matches (or no volume-id option is set),
+ * ENOENT when the VG cannot be opened and -1 on any validation failure.
+ */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv)
+{
+        vg_t                  brick      = NULL;
+        data_t               *tmp_data   = NULL;
+        struct dm_list       *tags       = NULL;
+        int                   op_ret     = -1;
+        uuid_t                dict_uuid  = {0, };
+        uuid_t                vg_uuid    = {0, };
+        gf_boolean_t          uuid       = _gf_false;
+        lvm_str_list_t       *strl       = NULL;
+        struct dm_list       *lv_dm_list = NULL;
+        lv_list_t            *lv_list    = NULL;
+        struct dm_list       *dm_seglist = NULL;
+        lvseg_list_t         *seglist    = NULL;
+        lvm_property_value_t  prop       = {0, };
+        gf_boolean_t          thin       = _gf_false;
+        const char           *lv_name    = NULL;
+        const char           *tag_uuid   = NULL;
+        size_t                key_len    = strlen (GF_XATTR_VOL_ID_KEY);
+
+        brick = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!brick) {
+                gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        lv_dm_list = lvm_vg_list_lvs (brick);
+        if (!lv_dm_list)
+                goto check;
+
+        /* look for an LV with a "thin-pool" segment */
+        dm_list_iterate_items (lv_list, lv_dm_list) {
+                dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+                if (!dm_seglist)
+                        continue;
+                dm_list_iterate_items (seglist, dm_seglist) {
+                        prop = lvm_lvseg_get_property (seglist->lvseg,
+                                                       "segtype");
+                        if (!prop.is_valid || !prop.value.string)
+                                continue;
+                        if (!strcmp (prop.value.string, "thin-pool")) {
+                                thin = _gf_true;
+                                lv_name = lvm_lv_get_name (lv_list->lv);
+                                priv->pool = gf_strdup (lv_name);
+                                gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+                                        "\"%s\" will be used for thin LVs",
+                                        lv_name);
+                                break;
+                        }
+                }
+        }
+
+check:
+        /* If there is no volume-id set in dict, we cant validate */
+        tmp_data = dict_get (this->options, "volume-id");
+        if (!tmp_data) {
+                op_ret = 0;
+                goto out;
+        }
+
+        op_ret = uuid_parse (tmp_data->data, dict_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in volume file",
+                        tmp_data->data);
+                op_ret = -1;
+                goto out;
+        }
+
+        tags = lvm_vg_get_tags (brick);
+        if (!tags) { /* no tags in the VG */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+        dm_list_iterate_items (strl, tags) {
+                if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, key_len)) {
+                        uuid = _gf_true;
+                        break;
+                }
+        }
+        /* UUID tag is not set in VG */
+        if (!uuid) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+
+        /*
+         * Skip the separator that follows the key, but only if there is
+         * one: a tag consisting of just the key must not be read past its
+         * terminating NUL.
+         */
+        tag_uuid = strl->str + key_len;
+        if (*tag_uuid)
+                tag_uuid++;
+
+        op_ret = uuid_parse (tag_uuid, vg_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in VG", strl->str);
+                op_ret = -1;
+                goto out;
+        }
+        if (uuid_compare (dict_uuid, vg_uuid)) {
+                /* log the textual uuid; vg_uuid itself is binary */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mismatching volume-id (%s) received. "
+                        "already is a part of volume %s ",
+                        tmp_data->data, tag_uuid);
+                op_ret = -1;
+                goto out;
+        }
+
+        op_ret = 0;
+
+out:
+        lvm_vg_close (brick);
+
+        if (!thin)
+                gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in "
+                        "VG %s\n", priv->vg);
+        else
+                priv->caps |= BD_CAPS_THIN;
+
+        return op_ret;
+}
+
+/* FIXME: Move this code to common place, so posix and bd xlator can use */
+/*
+ * Allocates a zeroed buffer large enough to hold @size bytes starting at an
+ * ALIGN_SIZE boundary.
+ *
+ * @size:        usable size wanted.
+ * @aligned_buf: receives the aligned address inside the allocation, or NULL
+ *               when the allocation fails.
+ *
+ * Returns the start of the allocation, which is the pointer to hand to
+ * GF_FREE(), or NULL on failure.
+ */
+char *
+page_aligned_alloc (size_t size, char **aligned_buf)
+{
+        char *alloc_buf = NULL;
+        char *buf       = NULL;
+
+        alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char);
+        if (!alloc_buf) {
+                /* do not leave a stale pointer for callers that test it */
+                *aligned_buf = NULL;
+                return NULL;
+        }
+        /* page aligned buffer */
+        buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+        *aligned_buf = buf;
+
+        return alloc_buf;
+}
+
+/*
+ * Returns the BD fd context of @fd in *@bdfd_p, creating it on first use by
+ * opening /dev/<vg>/<gfid>. Called with fd->lock held (see bd_fd_ctx_get).
+ *
+ * When the inode is not a regular file, or has no BD inode context, 0 is
+ * returned and *@bdfd_p is left untouched, so callers must initialise it to
+ * NULL. Otherwise returns 0 on success and a non-zero value on failure.
+ */
+static int
+__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p)
+{
+        int         ret      = -1;
+        int         _fd      = -1;
+        char       *devpath  = NULL;
+        bd_fd_t    *bdfd     = NULL;
+        uint64_t    tmp_bdfd = 0;
+        bd_priv_t  *priv     = this->private;
+        bd_gfid_t   gfid     = {0, };
+        bd_attr_t  *bdatt    = NULL;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                return 0;
+
+        ret = __fd_ctx_get (fd, this, &tmp_bdfd);
+        if (ret == 0) {
+                bdfd = (void *)(long) tmp_bdfd;
+                *bdfd_p = bdfd;
+                return 0;
+        }
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        if (asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid) < 0) {
+                /* the buffer is not usable after a failed asprintf() */
+                devpath = NULL;
+                ret = -1;
+                goto out;
+        }
+
+        _fd = open (devpath, O_RDWR | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bdfd, ret, out);
+
+        bdfd->fd = _fd;
+        bdfd->flag = O_RDWR | O_LARGEFILE;
+        if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                goto out;
+        }
+
+        *bdfd_p = bdfd;
+
+        ret = 0;
+out:
+        FREE (devpath);
+        if (ret) {
+                /* only close a descriptor that was actually opened */
+                if (_fd >= 0)
+                        close (_fd);
+                GF_FREE (bdfd);
+        }
+        return ret;
+}
+
+/*
+ * Locked wrapper around __bd_fd_ctx_get(): takes fd->lock for the duration
+ * of the lookup and returns its result.
+ */
+int
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd)
+{
+        int rc = 0;
+
+        /* FIXME: Is it ok to fd->lock here ? */
+        LOCK (&fd->lock);
+        rc = __bd_fd_ctx_get (this, fd, bdfd);
+        UNLOCK (&fd->lock);
+
+        return rc;
+}
+
+/*
+ * Validates whether an LV exists for the given gfid.
+ *
+ * @bd:      BD xattr value in "<type>[:<size>]" form; it is modified in
+ *           place (the ':' is overwritten with a NUL).
+ * @type:    receives a gf_strdup()'d copy of the type part.
+ * @lv_size: receives the actual size of the LV when it could be looked up.
+ * @uuid:    gfid naming the LV.
+ *
+ * Returns 0 if the LV exists and its size matches the size in the xattr.
+ * Returns -1 if the xattr type is invalid or the LV does not exist.
+ * Returns 1 if the LV size mismatches; *lv_size holds the actual size.
+ * Also returns 0, without setting *lv_size, when the device path cannot be
+ * allocated.
+ */
+int
+bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                      uint64_t *lv_size, uuid_t uuid)
+{
+        char        *path  = NULL;
+        int          ret   = -1;
+        bd_gfid_t    gfid  = {0, };
+        bd_priv_t   *priv  = this->private;
+        struct stat  stbuf = {0, };
+        uint64_t     size  = 0;
+        vg_t         vg    = NULL;
+        lv_t         lv    = NULL;
+        char        *bytes = NULL;
+
+        /* split "<type>:<size>" at the last ':' */
+        bytes = strrchr (bd, ':');
+        if (bytes) {
+                *bytes = '\0';
+                bytes++;
+                gf_string2bytesize (bytes, &size);
+        }
+
+        if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid xattr %s", bd);
+                return -1;
+        }
+        *type = gf_strdup (bd);
+
+        /*
+         * Check if LV really exist, there could be a failure
+         * after setxattr and successful LV creation
+         */
+        uuid_utoa_r (uuid, gfid);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid);
+        if (!path) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "insufficient memory");
+                return 0;
+        }
+
+        /* Destination file does not exist */
+        if (stat (path, &stbuf)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lstat failed for path %s", path);
+                /* leave through out so that path is released */
+                ret = -1;
+                goto out;
+        }
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (!vg) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "VG %s does not exist?", priv->vg);
+                ret = -1;
+                goto out;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "LV %s does not exist", gfid);
+                ret = -1;
+                goto out;
+        }
+
+        *lv_size = lvm_lv_get_size (lv);
+        if (size == *lv_size) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = 1;
+
+out:
+        if (vg)
+                lvm_vg_close (vg);
+
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Creates a thin LV named @lv of virtual size @extent bytes in thin pool
+ * @pool of volume group @vg by running the LVM_CREATE command.
+ *
+ * The command's own status is not inspected; success is judged by whether
+ * /dev/<vg>/<lv> exists afterwards.
+ *
+ * Returns 0 on success, ENOMEM if the device path cannot be allocated and
+ * EAGAIN if the device node is missing.
+ */
+static int
+create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent)
+{
+        int          ret    = -1;
+        runner_t     runner = {0, };
+        char        *path   = NULL;
+        struct stat  stat   = {0, };
+
+        runinit (&runner);
+        runner_add_args  (&runner, LVM_CREATE, NULL);
+        runner_add_args  (&runner, "--thin", NULL);
+        runner_argprintf (&runner, "%s/%s", vg, pool);
+        runner_add_args  (&runner, "--name", NULL);
+        runner_argprintf (&runner, "%s", lv);
+        runner_add_args  (&runner, "--virtualsize", NULL);
+        /* uint64_t is not 'long' everywhere; format it portably */
+        runner_argprintf (&runner, "%lluB", (unsigned long long) extent);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        gf_asprintf (&path, "/dev/%s/%s", vg, lv);
+        if (!path) {
+                ret = ENOMEM;
+                goto out;
+        }
+        if (lstat (path, &stat) < 0)
+                ret = EAGAIN;
+        else
+                ret = 0;
+out:
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Creates the LV backing the file identified by @uuid.
+ *
+ * @uuid: gfid used as the LV name.
+ * @size: size of the LV in bytes.
+ * @type: BD_THIN selects a thin LV in priv->pool; anything else creates a
+ *        linear LV.
+ * @priv: translator private data (LVM handle, VG and pool names).
+ *
+ * Returns 0 on success or a positive errno value on failure.
+ */
+int
+bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv)
+{
+        int        ret  = 0;
+        vg_t       vg   = NULL;
+        bd_gfid_t  gfid = {0, };
+
+        uuid_utoa_r (uuid, gfid);
+
+        if (!strcmp (type, BD_THIN))
+                return create_thin_lv (priv->vg, priv->pool, gfid,
+                                       size);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        if (!lvm_vg_create_lv_linear (vg, gfid, size)) {
+                /*
+                 * Take errno before logging can disturb it, and never
+                 * report success for a failed creation.
+                 */
+                ret = errno ? errno : EIO;
+                gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear "
+                        "failed");
+        }
+
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Resizes the LV named after @uuid to @size bytes by running the LVM_RESIZE
+ * command, then re-reads the LV size to verify the result.
+ *
+ * Returns 0 on success, EAGAIN if the VG cannot be opened and EIO if the LV
+ * is missing or its size does not match @size afterwards.
+ */
+int32_t
+bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size)
+{
+        uint64_t   new_size = 0;
+        runner_t   runner   = {0, };
+        bd_gfid_t  gfid     = {0, };
+        int        ret      = 0;
+        vg_t       vg       = NULL;
+        lv_t       lv       = NULL;
+
+        uuid_utoa_r (uuid, gfid);
+
+        runinit (&runner);
+
+        runner_add_args  (&runner, LVM_RESIZE, NULL);
+        runner_argprintf (&runner, "%s/%s", priv->vg, gfid);
+        /* off_t is not 'long' everywhere; format it portably */
+        runner_argprintf (&runner, "-L%lldb", (long long) size);
+        runner_add_args  (&runner, "-f", NULL);
+
+        runner_start (&runner);
+        runner_end (&runner);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return EAGAIN;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid);
+                ret = EIO;
+                goto out;
+        }
+        new_size = lvm_lv_get_size (lv);
+
+        if (new_size != (uint64_t) size) {
+                gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %llu does "
+                        "not match requested size %lld",
+                        (unsigned long long) new_size, (long long) size);
+                ret = EIO;
+        }
+
+out:
+        lvm_vg_close (vg);
+        return ret;
+}
+
+/*
+ * Returns the extent size of the translator's volume group, or 0 when the
+ * VG cannot be opened.
+ */
+uint64_t
+bd_get_default_extent (bd_priv_t *priv)
+{
+        uint64_t extent = 0;
+        vg_t     handle = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+
+        if (handle == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return 0;
+        }
+
+        extent = lvm_vg_get_extent_size (handle);
+        lvm_vg_close (handle);
+
+        return extent;
+}
+
+/*
+ * Rounds the user specified size up to a whole number of VG extents.
+ * Returns 0 when the extent size cannot be determined.
+ */
+uint64_t
+bd_adjust_size (bd_priv_t *priv, uint64_t size)
+{
+        uint64_t unit  = bd_get_default_extent (priv);
+        uint64_t count = 0;
+
+        if (unit == 0)
+                return 0;
+
+        /* whole extents, plus one more for any remainder */
+        count = size / unit + ((size % unit) ? 1 : 0);
+
+        return unit * count;
+}
+
+/*
+ * Removes the LV @lv_name from the translator's volume group.
+ *
+ * @op_errno: set to 0 on entry and to an errno value on failure (ENOENT when
+ *            the VG cannot be opened or the LV does not exist).
+ *
+ * Returns the result of lvm_vg_remove_lv(), or -1 when the VG or LV cannot
+ * be found.
+ */
+int
+bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno)
+{
+        vg_t  vg  = NULL;
+        lv_t  lv  = NULL;
+        int   ret = -1;
+
+        *op_errno = 0;
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                *op_errno = ENOENT;
+                return -1;
+        }
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (!lv) {
+                gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name);
+                *op_errno = ENOENT;
+                goto out;
+        }
+        ret = lvm_vg_remove_lv (lv);
+        if (ret < 0) {
+                /* take errno before logging can disturb it */
+                *op_errno = errno;
+                gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed",
+                        lv_name);
+                goto out;
+        }
+out:
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Stamps the access and/or modification time of @iatt with the current
+ * CLOCK_REALTIME time, as selected by GF_SET_ATTR_ATIME / GF_SET_ATTR_MTIME
+ * in @flag.
+ */
+inline void
+bd_update_amtime(struct iatt *iatt, int flag)
+{
+        struct timespec now = {0, };
+
+        clock_gettime (CLOCK_REALTIME, &now);
+
+        if (flag & GF_SET_ATTR_ATIME) {
+                iatt->ia_atime      = now.tv_sec;
+                iatt->ia_atime_nsec = now.tv_nsec;
+        }
+
+        if (flag & GF_SET_ATTR_MTIME) {
+                iatt->ia_mtime      = now.tv_sec;
+                iatt->ia_mtime_nsec = now.tv_nsec;
+        }
+}
+
+/*
+ * Creates a snapshot LV, named after local->dloc's gfid, of the LV named
+ * after local->loc's gfid by running the LVM_CREATE command. A size
+ * argument is passed only when the origin's type is not BD_THIN.
+ *
+ * Returns 0 if the snapshot device node exists afterwards, EIO if it does
+ * not and ENOMEM if the device path cannot be allocated.
+ */
+int
+bd_snapshot_create (bd_local_t *local, bd_priv_t *priv)
+{
+        bd_gfid_t    snap_name   = {0, };
+        bd_gfid_t    origin_name = {0, };
+        char        *snap_path   = NULL;
+        runner_t     cmd         = {0, };
+        struct stat  st          = {0, };
+        int          rc          = 0;
+
+        uuid_utoa_r (local->dloc->gfid, snap_name);
+        uuid_utoa_r (local->loc.gfid, origin_name);
+
+        gf_asprintf (&snap_path, "/dev/%s/%s", priv->vg, snap_name);
+        if (snap_path == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&cmd);
+        runner_add_args  (&cmd, LVM_CREATE, NULL);
+        runner_add_args  (&cmd, "--snapshot", NULL);
+        runner_argprintf (&cmd, "/dev/%s/%s", priv->vg, origin_name);
+        runner_add_args  (&cmd, "--name", NULL);
+        runner_argprintf (&cmd, "%s", snap_name);
+        if (strcmp (local->bdatt->type, BD_THIN) != 0)
+                runner_argprintf (&cmd, "-L%ldB", local->size);
+        runner_start (&cmd);
+        runner_end (&cmd);
+
+        /* the device node is the only evidence of success that is checked */
+        rc = (lstat (snap_path, &st) < 0) ? EIO : 0;
+
+        GF_FREE (snap_path);
+        return rc;
+}
+
+/*
+ * Creates a full copy of an LV: a new LV named after local->dloc's gfid is
+ * created with local->size and local->bdatt->type, then the contents of the
+ * LV named after local->loc's gfid are copied into it using O_DIRECT
+ * vectored IO on page aligned buffers.
+ *
+ * Returns 0 on success or a positive errno value on failure.
+ */
+int
+bd_clone (bd_local_t *local, bd_priv_t *priv)
+{
+        int            ret     = ENOMEM;
+        int            fd1     = -1;
+        int            fd2     = -1;
+        int            i       = 0;
+        int            nvec    = 0;
+        char          *buff    = NULL;
+        ssize_t        bytes   = 0;
+        ssize_t        written = 0;
+        size_t         left    = 0;
+        char          *spath   = NULL;
+        char          *dpath   = NULL;
+        struct iovec  *vec     = NULL;
+        bd_gfid_t      source  = {0, };
+        bd_gfid_t      dest    = {0, };
+        void          *bufp[IOV_NR] = {0, };
+
+        vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec);
+        if (!vec)
+                return ENOMEM;
+
+        for (i = 0; i < IOV_NR; i++) {
+                /* reset so a failed allocation is not hidden by old state */
+                buff = NULL;
+                bufp[i] = page_aligned_alloc (IOV_SIZE, &buff);
+                if (!bufp[i] || !buff)
+                        goto out;
+                vec[i].iov_base = buff;
+                vec[i].iov_len  = IOV_SIZE;
+        }
+
+        uuid_utoa_r (local->loc.gfid, source);
+        uuid_utoa_r (local->dloc->gfid, dest);
+
+        gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source);
+        gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest);
+        if (!spath || !dpath)
+                goto out;
+
+        ret = bd_create (local->dloc->gfid, local->size,
+                         local->bdatt->type, priv);
+        if (ret)
+                goto out;
+
+        fd1 = open (spath, O_RDONLY | O_DIRECT);
+        if (fd1 < 0) {
+                ret = errno;
+                goto out;
+        }
+        fd2 = open (dpath, O_WRONLY | O_DIRECT);
+        if (fd2 < 0) {
+                ret = errno;
+                goto out;
+        }
+
+        while (1) {
+                bytes = readv (fd1, vec, IOV_NR);
+                if (bytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s",
+                                strerror (ret));
+                        goto out;
+                }
+                if (!bytes)
+                        break;
+
+                /* trim the vector so only what was read gets written */
+                left = (size_t) bytes;
+                for (nvec = 0; nvec < IOV_NR && left > 0; nvec++) {
+                        if (left < vec[nvec].iov_len)
+                                vec[nvec].iov_len = left;
+                        left -= vec[nvec].iov_len;
+                }
+
+                written = writev (fd2, vec, nvec);
+                if (written < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING,
+                                "write failed: %s", strerror (ret));
+                        goto out;
+                }
+                if (written != bytes) {
+                        ret = EIO;
+                        gf_log (THIS->name, GF_LOG_WARNING,
+                                "write failed: %s", strerror (ret));
+                        goto out;
+                }
+
+                /* restore full lengths for the next read */
+                for (i = 0; i < nvec; i++)
+                        vec[i].iov_len = IOV_SIZE;
+        }
+        ret = 0;
+
+out:
+        for (i = 0; i < IOV_NR; i++)
+                GF_FREE (bufp[i]);
+        GF_FREE (vec);
+
+        if (fd1 != -1)
+                close (fd1);
+        if (fd2 != -1)
+                close (fd2);
+
+        /* gf_asprintf() memory belongs to GF_FREE(), not libc free() */
+        GF_FREE (spath);
+        GF_FREE (dpath);
+
+        return ret;
+}
+
+/*
+ * Merges snapshot LV to origin LV and returns status.
+ *
+ * The LVM_CONVERT command is run with --merge on /dev/<vg>/<gfid>; the merge
+ * is considered successful when that device node no longer exists.
+ *
+ * Returns 0 on success, EIO if the snapshot device is still present and
+ * ENOMEM if the device path cannot be allocated.
+ */
+int
+bd_merge (bd_priv_t *priv, uuid_t gfid)
+{
+        bd_gfid_t    dest   = {0, };
+        char        *path   = NULL;
+        struct stat  stat   = {0, };
+        runner_t     runner = {0, };
+        int          ret    = 0;
+
+        uuid_utoa_r (gfid, dest);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest);
+        if (!path) {
+                /* a NULL path must not reach the command line or lstat() */
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&runner);
+        runner_add_args  (&runner, LVM_CONVERT, NULL);
+        runner_add_args  (&runner, "--merge", NULL);
+        runner_argprintf (&runner, "%s", path);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        /* the snapshot device still being there means the merge failed */
+        if (!lstat (path, &stat))
+                ret = EIO;
+
+        GF_FREE (path);
+
+        return ret;
+}
+
+/*
+ * Looks up the "origin" property of the LV backing an inode and stores a
+ * copy of it in @dict under BD_ORIGIN. The inode is taken from @fd when
+ * given, otherwise from @loc.
+ *
+ * Returns ENOENT if the VG or LV cannot be found, ENODATA if the LV has no
+ * valid origin property, otherwise the result of dict_set_dynstr().
+ */
+int
+bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict)
+{
+        lvm_property_value_t  origin_prop = {0, };
+        bd_gfid_t             lv_name     = {0, };
+        vg_t                  vg          = NULL;
+        lv_t                  lv          = NULL;
+        inode_t              *inode       = NULL;
+        char                 *origin      = NULL;
+        int                   rc          = -1;
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (vg == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        inode = fd ? fd->inode : loc->inode;
+
+        uuid_utoa_r (inode->gfid, lv_name);
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (lv == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "LV %s not found",
+                        lv_name);
+                rc = ENOENT;
+                goto out;
+        }
+
+        origin_prop = lvm_lv_get_property (lv, "origin");
+        if (!origin_prop.is_valid || !origin_prop.value.string) {
+                rc = ENODATA;
+                goto out;
+        }
+
+        origin = gf_strdup (origin_prop.value.string);
+        rc = dict_set_dynstr (dict, BD_ORIGIN, origin);
+
+out:
+        lvm_vg_close (vg);
+        return rc;
+}
+
diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c
new file mode 100644
index 000000000..405474c58
--- /dev/null
+++ b/xlators/storage/bd/src/bd.c
@@ -0,0 +1,2404 @@
+/*
+ BD translator V2 - Exports Block devices on server side as regular
+ files to client
+
+ Now only exporting Logical volumes supported.
+
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#include <openssl/md5.h>
+#include <time.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "bd-aio.h"
+#include "defaults.h"
+#include "glusterfs3-xdr.h"
+#include "run.h"
+#include "protocol-common.h"
+#include "checksum.h"
+
+/*
+ * Completion callback for the setxattr/removexattr requests that are wound
+ * on a private frame. The result of the operation is not inspected; the
+ * only work done here is destroying the stack that owns the frame.
+ * FIXME: How to handle remove/setxattr failure
+ */
+int
+bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, dict_t *xdata)
+{
+        /* op_ret/op_errno are deliberately ignored */
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+/*
+ * bd_get_bd_info: checks whether @xattr carries a BD_XATTR mapping and
+ * validates it through bd_validate_bd_xattr(), which is handed @type and
+ * @size to fill in.
+ *
+ * Returns 1 when @xattr is NULL or holds no BD_XATTR. Otherwise BD_XATTR is
+ * removed from @xattr and the outcome depends on the validation result:
+ *   < 0 : a removexattr for BD_XATTR is wound to the first child and the
+ *         negative value is returned.
+ *   1   : the BD_XATTR value is rewritten as "<type>:<size>" through a
+ *         setxattr wound to the first child; 0 is returned once the
+ *         request has been issued.
+ *   else: the validation result is returned unchanged.
+ * Allocation failures are reported through BD_VALIDATE_MEM_ALLOC.
+ *
+ * Both requests run on a private copy of @frame that is destroyed by
+ * bd_null_rmsetxattr_cbk(); if the copy is never wound it is destroyed here.
+ */
+int
+bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid,
+                char **type, uint64_t *size)
+{
+        char         *bd_xattr = NULL;
+        char         *bd       = NULL;
+        int           ret      = -1;
+        loc_t         loc      = {0, };
+        dict_t       *dict     = NULL;
+        char         *p        = NULL;
+        call_frame_t *bd_frame = NULL;
+
+        if (!xattr)
+                return 1;
+
+        if (dict_get_str (xattr, BD_XATTR, &p))
+                return 1;
+
+        bd_xattr = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (bd_xattr, ret, out);
+
+        memcpy (loc.gfid, gfid, sizeof (uuid_t));
+
+        bd_frame = copy_frame (frame);
+        BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out);
+
+        ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid);
+        if (ret < 0) {/* LV does not exist */
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->removexattr, &loc,
+                            BD_XATTR, NULL);
+                /* the frame now belongs to the wound request */
+                bd_frame = NULL;
+
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Mapped LV not available for posix file <gfid:%s>, "
+                        "deleting mapping", uuid_utoa (gfid));
+        } else if (ret == 1) {
+                /* BD_XATTR size and LV size mismatch. Update BD_XATTR */
+                gf_asprintf (&bd, "%s:%ld", *type, *size);
+                BD_VALIDATE_MEM_ALLOC (bd, ret, out);
+
+                dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (dict, ret, out);
+
+                ret = dict_set_dynstr (dict, BD_XATTR, bd);
+                if (ret)
+                        goto out;
+                /* the string is owned by the dict from here on */
+                bd = NULL;
+
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0,
+                            NULL);
+                bd_frame = NULL;
+        }
+
+out:
+        /* a copied frame that was never wound has no callback to free it */
+        if (bd_frame)
+                STACK_DESTROY (bd_frame->root);
+        if (dict)
+                dict_unref (dict);
+        dict_del (xattr, BD_XATTR);
+        GF_FREE (bd_xattr);
+        GF_FREE (bd);
+        return ret;
+}
+
+/*
+ * bd_lookup_cbk: completion of the lookup wound by bd_lookup().
+ *
+ * For a successful lookup of a regular file the reply iatt is replaced by
+ * the iatt kept in the inode context. When no context exists yet,
+ * bd_get_bd_info() is consulted; on a zero result a new bd_attr_t is
+ * built from @buf, size and block count are set from the reported size and
+ * it is stored in the inode context.
+ * Every path ends by unwinding the lookup with the incoming op_ret.
+ */
+int32_t
+bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr,
+               struct iatt *postparent)
+{
+        bd_attr_t *cached  = NULL;
+        uint64_t   lv_size = 0;
+        char      *lv_type = BD_TYPE_NONE;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto unwind;
+
+        if (bd_inode_ctx_get (inode, this, &cached)) {
+                /* nothing cached yet: find out whether this is a BD file */
+                if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid,
+                                    &lv_type, &lv_size))
+                        goto unwind;
+
+                cached = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+                if (cached == NULL) {
+                        op_errno = ENOMEM;
+                        goto unwind;
+                }
+                memcpy (&cached->iatt, buf, sizeof (struct iatt));
+                cached->type = lv_type;
+
+                /* Cache LV size in inode_ctx */
+                if (bd_inode_ctx_set (inode, this, cached) < 0) {
+                        GF_FREE (cached);
+                        op_errno = EINVAL;
+                        goto unwind;
+                }
+
+                cached->iatt.ia_size = lv_size;
+                cached->iatt.ia_blocks = lv_size / 512;
+        }
+
+        /* BD file: reply with the cached attributes */
+        dict_del (xattr, GF_CONTENT_KEY);
+        memcpy (buf, &cached->iatt, sizeof (struct iatt));
+
+unwind:
+        BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+                         xattr, postparent);
+        return 0;
+}
+
+/*
+ * bd_lookup: Issues posix_lookup to find out if file is mapped to BD
+ * bd_lookup -> posix_lookup -> bd_lookup_cbk
+ *
+ * When the inode has no BD context yet, BD_XATTR is added to the request
+ * dict (a temporary dict is created if the caller passed none) so that the
+ * reply carries the mapping. On failure the lookup is unwound with -1.
+ */
+int32_t
+bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+        dict_t     *bd_xattr = NULL;
+        bd_attr_t  *bdatt    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->path, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) {
+                if (!xattr_req) {
+                        bd_xattr = dict_new ();
+                        BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out);
+                        xattr_req = bd_xattr;
+                }
+                if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0)
+                        goto out;
+        }
+
+        STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, loc, xattr_req);
+
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+        return 0;
+out:
+        /* drop the temporary request dict on the error path as well */
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+
+        BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_forget: drops the BD attributes cached in the inode context.
+ *
+ * The cached bd_attr_t is allocated with GF_CALLOC by the callbacks in this
+ * file that store it (bd_lookup_cbk, bd_setx_setx_cbk), so it is released
+ * with GF_FREE.
+ * NOTE(review): bdatt->type is not released here because its allocator
+ * differs between the setters - confirm ownership before freeing it.
+ * Always returns 0.
+ */
+int
+bd_forget (xlator_t *this, inode_t *inode)
+{
+        int        ret   = -1;
+        uint64_t   ctx   = 0;
+        bd_attr_t *bdatt = NULL;
+
+        ret = bd_inode_ctx_get (inode, this, &bdatt);
+        if (!ret) {
+                inode_ctx_del (inode, this, &ctx);
+                GF_FREE (bdatt);
+        }
+        return 0;
+}
+
+/*
+ * bd_readdirp_cbk: completion of the readdirp wound by bd_readdirp().
+ *
+ * For every regular-file entry for which bd_get_bd_info() returns 0, the
+ * entry's size and block count are replaced by the reported size. The
+ * reply is then unwound with the incoming op_ret/op_errno.
+ */
+int
+bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+        gf_dirent_t *dent    = NULL;
+        uint64_t     lv_size = 0;
+        char        *lv_type = NULL;
+
+        if (op_ret >= 0) {
+                list_for_each_entry (dent, &entries->list, list) {
+                        /* only regular files can be backed by a BD */
+                        if (dent->d_type == DT_REG &&
+                            !bd_get_bd_info (frame, this, dent->dict,
+                                             dent->d_stat.ia_gfid, &lv_type,
+                                             &lv_size)) {
+                                dent->d_stat.ia_size = lv_size;
+                                dent->d_stat.ia_blocks = lv_size / 512;
+                                FREE (lv_type);
+                        }
+                }
+        }
+
+        BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
+        return 0;
+}
+
+/*
+ * bd_readdirp: asks the first child for directory entries together with
+ * BD_XATTR, so that bd_readdirp_cbk can report the LV size as ia_size for
+ * entries that are mapped to a BD.
+ *
+ * If the caller supplies no dict, one is created and kept in the frame
+ * local. Failures unwind readdirp with -1.
+ */
+int32_t
+bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *dict)
+{
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (fd, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        if (dict == NULL) {
+                /* no request dict from the caller: keep our own in local */
+                local = bd_local_init (frame, this);
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+                local->dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, unwind);
+                dict = local->dict;
+        }
+
+        if (dict_set_int8 (dict, BD_XATTR, 0) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set key %s", BD_XATTR);
+                goto unwind;
+        }
+
+        STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict);
+        return 0;
+}
+
+/*
+ * bd_stat_cbk: completion of the stat wound by bd_stat().
+ *
+ * For a successful stat of a regular file whose inode has a BD context the
+ * reply iatt is overwritten with the cached one; the reply is then unwound.
+ */
+int
+bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto out;
+
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /*
+         * update buf with LV size. Copy from the iatt member explicitly, as
+         * bd_fstat_cbk does, instead of relying on where iatt sits inside
+         * bd_attr_t.
+         */
+        if (!bd_inode_ctx_get (local->inode, this, &bdatt))
+                memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+        BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_stat: answers stat from the inode context when BD attributes are
+ * cached there; otherwise remembers the inode in the frame local and winds
+ * stat to the first child (completed by bd_stat_cbk).
+ */
+int
+bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+        VALIDATE_OR_GOTO (this->private, fail);
+
+        if (bd_inode_ctx_get (loc->inode, this, &cached) == 0) {
+                /* cached attributes are returned without asking the child */
+                BD_STACK_UNWIND (stat, frame, 0, 0, &cached->iatt, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, fail);
+        local->inode = inode_ref (loc->inode);
+
+        STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->stat, loc, xdata);
+        return 0;
+
+fail:
+        BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_statfs_cbk: completion of the statfs wound by bd_statfs().
+ *
+ * On success the size and free size of the VG are added, in units of
+ * f_frsize, to the block counters of the child's reply. If the VG cannot
+ * be opened the call is failed with EAGAIN.
+ */
+int
+bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct statvfs *buff, dict_t *xdata)
+{
+        bd_priv_t *priv     = NULL;
+        vg_t       vg       = NULL;
+        uint64_t   vg_total = 0;
+        uint64_t   vg_free  = 0;
+
+        if (op_ret < 0)
+                goto unwind;
+
+        priv = this->private;
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (vg == NULL) {
+                gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                op_ret = -1;
+                op_errno = EAGAIN;
+                goto unwind;
+        }
+        vg_total = lvm_vg_get_size (vg);
+        vg_free = lvm_vg_get_free_size (vg);
+        lvm_vg_close (vg);
+
+        /* account for the VG on top of what the child reported */
+        buff->f_blocks += vg_total / buff->f_frsize;
+        buff->f_bfree += vg_free / buff->f_frsize;
+        buff->f_bavail += vg_free / buff->f_frsize;
+
+unwind:
+        BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata);
+        return 0;
+}
+
+/*
+ * bd_statfs: forwards statfs to the first child; bd_statfs_cbk then adds
+ * the VG's total and free space to the reply. Invalid arguments unwind
+ * with EINVAL.
+ */
+int
+bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        VALIDATE_OR_GOTO (frame, invalid);
+        VALIDATE_OR_GOTO (this, invalid);
+        VALIDATE_OR_GOTO (this->private, invalid);
+        VALIDATE_OR_GOTO (loc, invalid);
+
+        STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->statfs, loc, xdata);
+        return 0;
+
+invalid:
+        BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_fstat_cbk: completion of the fstat wound by bd_fstat().
+ *
+ * A successful reply for a regular file is overwritten with the iatt
+ * cached in the inode context, if there is one, before unwinding.
+ */
+int
+bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local  = frame->local;
+        bd_attr_t  *cached = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto unwind;
+
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, unwind);
+
+        /* update buf with LV size */
+        if (bd_inode_ctx_get (local->inode, this, &cached) == 0)
+                memcpy (buf, &cached->iatt, sizeof (struct iatt));
+
+unwind:
+        BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_fstat: answers fstat from the inode context when BD attributes are
+ * cached there; otherwise remembers the inode in the frame local and winds
+ * fstat to the first child (completed by bd_fstat_cbk).
+ */
+int
+bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+        VALIDATE_OR_GOTO (this->private, fail);
+
+        /* if its already cached return it */
+        if (bd_inode_ctx_get (fd->inode, this, &cached) == 0) {
+                BD_STACK_UNWIND (fstat, frame, 0, 0, &cached->iatt, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, fail);
+
+        local->inode = inode_ref (fd->inode);
+
+        STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fstat, fd, xdata);
+
+        return 0;
+
+fail:
+        BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD
+ * file
+ *
+ * A file counts as a BD file when a bd_fd_t is stored in the fd context.
+ * For BD files @size bytes are read with pread() from the block device at
+ * @offset into a fresh iobuf. op_errno is set to ENOENT when the read
+ * reaches or passes the cached LV size (or that size is 0). The cached
+ * atime is updated and the cached iatt is returned as the post-op stat.
+ *
+ * Errors: EINVAL for bad arguments, a zero @size or a missing inode
+ * context; ENOMEM when iobuf/iobref cannot be allocated; errno of pread().
+ */
+int
+bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int            ret      = -1;
+        int            _fd      = -1;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = EINVAL;
+        bd_fd_t       *bd_fd    = NULL;
+        struct iovec   vec      = {0, };
+        struct iobuf  *iobuf    = NULL;
+        struct iobref *iobref   = NULL;
+        uint64_t       bd_size  = 0;
+        bd_attr_t     *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->readv,
+                            fd, size, offset, flags, xdata);
+                return 0;
+        }
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto out;
+        }
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+        _fd = bd_fd->fd;
+        op_ret = pread (_fd, iobuf->ptr, size, offset);
+        if (op_ret == -1) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "read failed on fd=%p: %s", fd,
+                        strerror (op_errno));
+                goto out;
+        }
+        /* the read itself succeeded; clear the default error */
+        op_errno = 0;
+
+        vec.iov_base = iobuf->ptr;
+        vec.iov_len = op_ret;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto out;
+        }
+        iobref_add (iobref, iobuf);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                op_errno = EINVAL;
+                op_ret = -1;
+                goto out;
+        }
+        bd_size = bdatt->iatt.ia_size;
+        /* reads touching the end of the LV are flagged with ENOENT */
+        if (!bd_size || (offset + vec.iov_len) >= bd_size)
+                op_errno = ENOENT;
+
+        op_ret = vec.iov_len;
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME);
+
+out:
+        /* bdatt is still NULL on the early error paths */
+        BD_STACK_UNWIND (readv, frame, op_ret, op_errno,
+                         &vec, 1, bdatt ? &bdatt->iatt : NULL, iobref, NULL);
+
+        if (iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        return 0;
+}
+
+#ifdef BLKDISCARD
+/*
+ * bd_discard: Sends BLKDISCARD ioctl to the block device
+ *
+ * Inodes without a BD context are passed to the first child unchanged.
+ * For BD files the byte range {offset, len} is handed to BLKDISCARD on the
+ * BD fd, the cached mtime is updated and the cached iatt before/after the
+ * update is returned as pre/post stat.
+ *
+ * Errors: EINVAL for bad arguments or a missing fd context, ENOSYS when
+ * the ioctl fails with ENOTTY, otherwise errno of the ioctl.
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        int        ret      = -1;
+        int        op_errno = EINVAL;
+        bd_fd_t   *bd_fd    = NULL;
+        uint64_t   param[2] = {0, };
+        bd_attr_t *bdatt    = NULL;
+        struct iatt prebuf  = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* posix */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->discard,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        param[0] = offset;
+        param[1] = len;
+        ret = ioctl (bd_fd->fd, BLKDISCARD, param);
+        if (ret < 0) {
+                if (errno == ENOTTY)
+                        op_errno = ENOSYS;
+                else
+                        op_errno = errno;
+                goto out;
+        }
+        /* success: do not report the EINVAL default as the error code */
+        op_errno = 0;
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+
+        BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf,
+                         &bdatt->iatt, xdata);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+#else
+
+/*
+ * bd_discard: BLKDISCARD is not available at build time, so discard is
+ * always failed with ENOSYS.
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL);
+        return 0;
+}
+#endif
+
+/*
+ * Call back from posix_open for opening the backing posix file
+ * If it failed, close BD fd
+ *
+ * The bd_fd_t is removed from the fd context before it is released, so
+ * that bd_release() cannot find and close/free it a second time.
+ */
+int
+bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        uint64_t   tmp_bfd = 0;
+
+        if (!op_ret)
+                goto out;
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (!bdatt) /* posix file */
+                goto out;
+
+        /* posix open failed */
+        if (fd_ctx_del (fd, this, &tmp_bfd) < 0 || !tmp_bfd) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bd_fd is NULL from fd=%p", fd);
+                goto out;
+        }
+        bd_fd = (bd_fd_t *)(long)tmp_bfd;
+
+        close (bd_fd->fd);
+        GF_FREE (bd_fd);
+
+out:
+        BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_open: Opens BD file if given posix file is mapped to BD. Also opens
+ * posix file.
+ * fd contains both posix and BD fd
+ *
+ * For a regular file with a BD context, /dev/<vg>/<gfid> is opened with
+ * @flags | O_LARGEFILE and recorded in a bd_fd_t stored in the fd context.
+ * The open is then wound to the first child in all non-error cases
+ * (completed by bd_open_cbk). Errors unwind the open with -1 and release
+ * whatever was acquired here.
+ */
+int32_t
+bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+        int32_t    ret     = EINVAL;
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        bd_gfid_t  gfid    = {0, };
+        char      *devpath = NULL;
+        bd_priv_t *priv    = NULL;
+        int        _fd     = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* read private data only after 'this' has been validated */
+        priv = this->private;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                goto posix;
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        /* asprintf leaves the pointer undefined on failure */
+        if (asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid) < 0)
+                devpath = NULL;
+        BD_VALIDATE_MEM_ALLOC (devpath, ret, out);
+
+        _fd = open (devpath, flags | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out);
+
+        bd_fd->fd = _fd;
+        bd_fd->flag = flags | O_LARGEFILE;
+
+        if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                goto out;
+        }
+
+        ret = 0;
+
+posix:
+        /* the device path is only needed for the open() above */
+        FREE (devpath);
+
+        /* open posix equivalant of this file, fd needed for fd related
+           operations like fsetxattr, ftruncate etc */
+        STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL);
+
+        FREE (devpath);
+        if (_fd >= 0)
+                close (_fd);
+        GF_FREE (bd_fd);
+
+        return 0;
+}
+
+/*
+ * Completion of the setattr wound by bd_fsync() to store the updated
+ * times on the posix file. The fsync is unwound with the setattr result;
+ * the iatt kept in the frame local is reported as both pre and post stat.
+ */
+int
+bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, struct iatt *pre,
+                      struct iatt *post, dict_t *xdata)
+{
+        bd_local_t  *local = frame->local;
+        struct iatt *stbuf = &local->bdatt->iatt;
+
+        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, stbuf, stbuf, NULL);
+        return 0;
+}
+
+/*
+ * bd_do_fsync: flushes @fd with fdatasync() when @datasync is set and
+ * HAVE_FDATASYNC is defined, otherwise with fsync().
+ * Returns 0 on success or the errno of the failed call, which is also
+ * logged.
+ */
+int
+bd_do_fsync (int fd, int datasync)
+{
+        int saved_errno = 0;
+
+#ifdef HAVE_FDATASYNC
+        if (datasync) {
+                if (fdatasync (fd) == 0)
+                        return 0;
+
+                saved_errno = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "fdatasync on fd=%d failed: %s",
+                        fd, strerror (errno));
+                return saved_errno;
+        }
+#endif
+
+        if (fsync (fd) == 0)
+                return 0;
+
+        saved_errno = errno;
+        gf_log (THIS->name, GF_LOG_ERROR,
+                "fsync on fd=%d failed: %s",
+                fd, strerror (saved_errno));
+
+        return saved_errno;
+}
+
+/*
+ * bd_fsync: Syncs if BD fd, forwards the request to posix
+ * fsync -> posix_setattr -> posix_fsync
+ *
+ * Files without both a BD fd context and a BD inode context are passed to
+ * the first child. For BD files the device fd is synced with
+ * bd_do_fsync():
+ *  - datasync: the call is unwound here with op_ret 0 on success.
+ *  - full fsync: a copy of the cached attributes with updated a/mtime is
+ *    written to the posix file through setattr; bd_fsync_setattr_cbk
+ *    unwinds the fsync.
+ * Failures unwind with -1 and the errno of the sync, ENOMEM or EINVAL.
+ */
+int32_t
+bd_fsync (call_frame_t *frame, xlator_t *this,
+          fd_t *fd, int32_t datasync, dict_t *xdata)
+{
+        int          ret      = -1;
+        int32_t      op_ret   = -1;
+        int32_t      op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        bd_attr_t   *bdatt    = NULL;
+        bd_local_t  *local    = NULL;
+        int          valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+        struct iatt  prebuf   = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        /* a missing inode context shows up as bdatt == NULL below */
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd || !bdatt) {
+                STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsync, fd, datasync,
+                            xdata);
+                return 0;
+        }
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_errno = bd_do_fsync (bd_fd->fd, datasync);
+        if (op_errno)
+                goto out;
+
+        /* For BD, Update the a|mtime during full fsync only */
+        if (!datasync) {
+                local = bd_local_init (frame, this);
+                /* In case of mem failure, should posix flush called ? */
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+                local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+                BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+                local->bdatt->type = gf_strdup (bdatt->type);
+                memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt));
+                bd_update_amtime (&local->bdatt->iatt, valid);
+                uuid_copy (local->loc.gfid, fd->inode->gfid);
+                STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setattr, &local->loc,
+                            &local->bdatt->iatt,
+                            valid, NULL);
+                return 0;
+        }
+
+        /* datasync completed: report success instead of the -1 default */
+        op_ret = 0;
+
+out:
+        /* bdatt is NULL when argument validation failed */
+        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf,
+                         bdatt ? &bdatt->iatt : NULL, NULL);
+        return 0;
+}
+
+/*
+ * Completion of the setattr wound by bd_flush(): the flush is unwound
+ * with the setattr result; the pre/post attributes are not used.
+ */
+int
+bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, struct iatt *pre,
+                      struct iatt *post, dict_t *xdata)
+{
+        /* only status and xdata are propagated to the flush caller */
+        BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_flush: for a BD file (inode context and fd context both present) the
+ * cached attributes are written to the posix file through setattr with
+ * the atime/mtime bits set; bd_flush_setattr_cbk completes the flush.
+ * In every other case, including validation or allocation failure, the
+ * flush is forwarded to the first child.
+ */
+int
+bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        int         ret      = -1;
+        int         op_errno = EINVAL;
+        int         valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+        bd_fd_t    *bd_fd    = NULL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+        loc_t       loc      = {0, };
+
+        VALIDATE_OR_GOTO (frame, passthrough);
+        VALIDATE_OR_GOTO (this, passthrough);
+        VALIDATE_OR_GOTO (fd, passthrough);
+        VALIDATE_OR_GOTO (this->private, passthrough);
+
+        /* no BD attributes cached: plain posix file */
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (bdatt == NULL)
+                goto passthrough;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || bd_fd == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bdfd/bdatt is NULL from fd=%p", fd);
+                goto passthrough;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, passthrough);
+
+        local->fd = fd_ref (fd);
+        uuid_copy (loc.gfid, bdatt->iatt.ia_gfid);
+
+        /* Update the a|mtime during flush */
+        STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt,
+                    valid, NULL);
+
+        return 0;
+
+passthrough:
+        STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->flush, fd, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_release: for an fd whose inode has a BD context, removes the bd_fd_t
+ * from the fd context, closes the block device fd and frees the
+ * structure. Always returns 0.
+ */
+int32_t
+bd_release (xlator_t *this, fd_t *fd)
+{
+        bd_priv_t *priv    = this->private;
+        bd_attr_t *bdatt   = NULL;
+        bd_fd_t   *bd_fd   = NULL;
+        uint64_t   ctx_val = 0;
+        int        ret     = -1;
+
+        VALIDATE_OR_GOTO (this, done);
+        VALIDATE_OR_GOTO (fd, done);
+        VALIDATE_OR_GOTO (priv, done);
+
+        ret = bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (ret != 0 || bdatt == NULL) /* posix file */
+                goto done;
+
+        /* FIXME: Update amtime during release */
+
+        if (fd_ctx_del (fd, this, &ctx_val) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bfd is NULL from fd=%p", fd);
+                goto done;
+        }
+
+        /* the context value is the bd_fd_t pointer stored at open time */
+        bd_fd = (bd_fd_t *)(long)ctx_val;
+        close (bd_fd->fd);
+        GF_FREE (bd_fd);
+
+done:
+        return 0;
+}
+
+/*
+ * Call back for removexattr after removing BD_XATTR incase of
+ * bd create failure
+ *
+ * The original request is always failed with EIO, whatever the result of
+ * the removexattr. It is unwound as fsetxattr when it came in on an fd
+ * (local->fd set) and as setxattr otherwise.
+ */
+int
+bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata);
+        return 0;
+
+}
+
+/*
+ * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure
+ * invokes posix_removexattr to remove created BD_XATTR
+ *
+ * When the LV is created successfully, a copy of the attributes kept in
+ * the frame local is stored in the inode context. The request is unwound
+ * as fsetxattr when it came in on an fd and as setxattr otherwise.
+ */
+int
+bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        if (op_ret < 0)
+                goto next;
+
+        /* Create LV */
+        op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size,
+                              local->bdatt->type, this->private);
+        if (!op_errno)
+                goto out;
+
+        /* LV creation failed, remove BD_XATTR */
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fremovexattr,
+                            local->fd, BD_XATTR, NULL);
+        else
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr,
+                            &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+
+        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        if (!bdatt) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto next;
+        }
+
+        memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt));
+        bdatt->type = gf_strdup (local->bdatt->type);
+
+        /* if caching fails nobody else owns the copy, so release it */
+        if (bd_inode_ctx_set (local->inode, THIS, bdatt) < 0) {
+                GF_FREE (bdatt->type);
+                GF_FREE (bdatt);
+        }
+
+next:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+        return 0;
+
+}
+
+/*
+ * Call back from posix_stat
+ *
+ * Parses the BD_XATTR value kept in local->data, of the form
+ * "<type>[:<size>]" where <type> is BD_LV or BD_THIN. Without <size> the
+ * default extent size is used. The normalised "<type>:<size>" string is
+ * then set as BD_XATTR on the posix file (fsetxattr when the request came
+ * in on an fd, setxattr otherwise); bd_setx_setx_cbk continues from there.
+ *
+ * Errors unwind the request with -1: EOPNOTSUPP for non-regular files,
+ * EINVAL for a malformed value, ENOMEM on allocation failure.
+ */
+int
+bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, struct iatt *iatt,
+                  dict_t *xdata)
+{
+        char       *param  = NULL;
+        char       *type   = NULL;
+        char       *s_size = NULL;
+        char       *p      = NULL;
+        char       *copy   = NULL;
+        bd_local_t *local  = frame->local;
+        bd_priv_t  *priv   = this->private;
+        char       *bd     = NULL;
+        uint64_t    size   = 0;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        /* work on a NUL-terminated private copy; strtok_r modifies it */
+        param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        type = strtok_r (param, ":", &p);
+        if (!type) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given",
+                        type);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        s_size = strtok_r (NULL, ":", &p);
+
+        /* If size not specified get default size */
+        if (!s_size) {
+                size = bd_get_default_extent (priv);
+        } else if (gf_string2bytesize (s_size, &size)) {
+                /* an unparsable size must not silently become 0 */
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        gf_asprintf (&bd, "%s:%ld", type, size);
+        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        local->bdatt->type = gf_strdup (type);
+        memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt));
+        local->bdatt->iatt.ia_size = size;
+
+        /* 'type' was duplicated above, the scratch copy is no longer used */
+        GF_FREE (copy);
+
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+        GF_FREE (bd);
+        GF_FREE (copy);
+        return 0;
+}
+
+/*
+ * Completion of the removexattr wound by bd_offload_setx_cbk() after a
+ * failed snapshot/clone. The original request is failed with EIO in all
+ * cases: as fsetxattr when local->fd is set, as setxattr otherwise.
+ */
+int
+bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (!local->fd) {
+                BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL);
+                return 0;
+        }
+
+        BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL);
+        return 0;
+}
+
+/*
+ * Completion of the setxattr that stores BD_XATTR on the destination file
+ * of an offload request.
+ *
+ * On success the snapshot (BD_OF_SNAPSHOT) or clone is created. If that
+ * fails, BD_XATTR is removed from the destination again and
+ * bd_offload_rm_xattr_cbk finishes the request. Otherwise the request is
+ * unwound here with op_ret/op_errno, as fsetxattr when it came in on an fd
+ * and as setxattr otherwise.
+ */
+int
+bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (local->offload == BD_OF_SNAPSHOT)
+                op_ret = bd_snapshot_create (frame->local, this->private);
+        else
+                op_ret = bd_clone (frame->local, this->private);
+
+        if (op_ret) {
+                STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr,
+                            local->dloc, BD_XATTR, NULL);
+                return 0;
+        }
+
+out:
+        /* both variants report op_ret as the result, op_errno as the error */
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Completion of the getxattr that reads BD_XATTR ("<type>:<size>") from
+ * the source file of an offload request.
+ *
+ * The type is taken from the source value. If no size was requested
+ * (local->size == 0) the size is taken from the source value as well. The
+ * resulting "<type>:<size>" is then set as BD_XATTR on the destination
+ * (local->dloc); bd_offload_setx_cbk continues from there.
+ *
+ * Errors unwind the request with -1 (EINVAL for a missing or malformed
+ * source xattr, ENOMEM on allocation failure).
+ */
+int
+bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+        char       *bd    = NULL;
+        bd_local_t *local = frame->local;
+        char       *type  = NULL;
+        char       *p     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (dict_get_str (xattr, BD_XATTR, &p)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        type = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (type, op_errno, out);
+
+        p = strrchr (type, ':');
+        if (!p) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "source file xattr %s corrupted?", type);
+                goto out;
+        }
+
+        /* split "<type>:<size>" in place */
+        *p='\0';
+
+        /* For clone size is taken from source LV */
+        if (!local->size) {
+                p++;
+                gf_string2bytesize (p, &local->size);
+        }
+        gf_asprintf (&bd, "%s:%ld", type, local->size);
+        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+        local->bdatt->type = gf_strdup (type);
+        dict_del (local->dict, BD_XATTR);
+        dict_del (local->dict, LINKTO);
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        /* the scratch copy has been duplicated into local->bdatt->type */
+        GF_FREE (type);
+
+        STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->setxattr,
+                    local->dloc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (type);
+        GF_FREE (bd);
+
+        return 0;
+}
+
+/*
+ * Completion of the lookup on the destination gfid of an offload request.
+ *
+ * The destination must be a regular file, must not carry the LINKTO xattr
+ * and must not already have BD_XATTR (EEXIST). When these checks pass,
+ * BD_XATTR of the source file is fetched; bd_offload_getx_cbk continues.
+ *
+ * local->bdatt is allocated with GF_CALLOC, like the other places in this
+ * file that allocate local->bdatt.
+ * NOTE(review): confirm that bd_local_free() releases it with GF_FREE.
+ */
+int
+bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno,
+                            inode_t *inode, struct iatt *iatt,
+                            dict_t *xattr, struct iatt *postparent)
+{
+        bd_local_t *local  = frame->local;
+        char       *bd     = NULL;
+        int         ret    = -1;
+        char       *linkto = NULL;
+
+        if (op_ret < 0 && op_errno != ENODATA) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a "
+                        "regular file");
+                goto out;
+        }
+
+        ret = dict_get_str (xattr, LINKTO, &linkto);
+        if (linkto) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination file not "
+                        "present in same brick");
+                goto out;
+        }
+
+        ret = dict_get_str (xattr, BD_XATTR, &bd);
+        if (bd) {
+                op_errno = EEXIST;
+                goto out;
+        }
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->getxattr,
+                    &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Completion of the unlink wound by bd_do_merge(): the setxattr request
+ * that triggered the merge is unwound with the result of the unlink.
+ */
+int
+bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+        /* FIXME: if delete failed, remove xattr */
+
+        /* the parent attributes are not passed on */
+        BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_do_merge: merges the snapshot LV of local->inode into its origin
+ * with bd_merge() and then unlinks the posix file (local->loc);
+ * bd_merge_unlink_cbk completes the request.
+ *
+ * Returns 0 once the unlink has been wound. On failure the request is
+ * unwound with -1 and the error code is returned: the result of
+ * bd_merge(), or EINVAL when the parent inode cannot be found.
+ */
+int
+bd_do_merge(call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local    = frame->local;
+        inode_t    *parent   = NULL;
+        char       *p        = NULL;
+        int         op_errno = 0;
+
+        op_errno = bd_merge (this->private, local->inode->gfid);
+        if (op_errno)
+                goto out;
+
+        /*
+         * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does
+         * not have loc->pargfid set. Get parent's gfid by getting parents inode
+         */
+        parent = inode_parent (local->inode, NULL, NULL);
+        if (!parent) {
+                /*
+                 * FIXME: Snapshot LV already deleted.
+                 * remove xattr, instead of returning failure
+                 */
+                op_errno = EINVAL;
+                goto out;
+        }
+        uuid_copy (local->loc.pargfid, parent->gfid);
+        /* only the gfid was needed; drop the inode obtained above */
+        inode_unref (parent);
+
+        /* the entry name is the last component of the path */
+        p = strrchr (local->loc.path, '/');
+        if (p)
+                p++;
+        local->loc.name = p;
+
+        STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+
+        return op_errno;
+}
+
+/*
+ * bd_offload: starts a clone/snapshot request.
+ *
+ * local->data holds "<dest-gfid>[:<size>]". The destination gfid is
+ * parsed into local->dloc and the destination is looked up with BD_XATTR
+ * and LINKTO requested; bd_offload_dest_lookup_cbk continues from there.
+ * Without <size>, the default extent size is used unless @offload is
+ * BD_OF_CLONE, in which case local->size is left untouched.
+ *
+ * On error the request is unwound with -1 (as fsetxattr when @fd is
+ * given, as setxattr otherwise): EINVAL for a malformed value, ENOMEM on
+ * allocation failure. Always returns 0.
+ */
+int
+bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            fd_t *fd, bd_offload_t offload)
+{
+        char       *param      = NULL;
+        char       *param_copy = NULL;
+        char       *p          = NULL;
+        char       *size       = NULL;
+        char       *gfid       = NULL;
+        int         op_errno   = 0;
+        bd_local_t *local      = frame->local;
+
+        param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+        param_copy = param;
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->dloc = CALLOC (1, sizeof (loc_t));
+        BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        gfid = strtok_r (param, ":", &p);
+        if (!gfid) {
+                /* empty value: there is no destination gfid to parse */
+                op_errno = EINVAL;
+                goto out;
+        }
+        size = strtok_r (NULL, ":", &p);
+        if (size) {
+                if (gf_string2bytesize (size, &local->size)) {
+                        op_errno = EINVAL;
+                        goto out;
+                }
+        } else if (offload != BD_OF_CLONE)
+                local->size = bd_get_default_extent (this->private);
+
+        if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        if (dict_set_int8 (local->dict, LINKTO, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (uuid_parse (gfid, local->dloc->gfid)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        local->offload = offload;
+
+        /* gfid/size point into the scratch copy, which is done with now */
+        GF_FREE (param_copy);
+
+        STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, local->dloc,
+                    local->dict);
+
+        return 0;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (param_copy);
+        return 0;
+}
+
+/*
+ * bd_setxattr: Used to create & map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ *
+ * The request dict is searched, in this order, for BD_XATTR, BD_CLONE,
+ * BD_SNAPSHOT and BD_MERGE. If none is present the call is passed to the
+ * first child unchanged. Clone/snapshot/merge need a file that is already
+ * mapped to a BD (EINVAL otherwise); BD_XATTR needs a file that is not yet
+ * mapped (EEXIST otherwise).
+ *
+ * The error path unwinds through BD_STACK_UNWIND like every other unwind
+ * in this file.
+ * NOTE(review): presumably BD_STACK_UNWIND also releases frame->local -
+ * confirm in bd.h.
+ */
+int
+bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int flags, dict_t *xdata)
+{
+        int           op_errno = 0;
+        data_t       *data     = NULL;
+        bd_local_t   *local    = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE)))
+                cl_type = BD_OF_MERGE;
+
+        bd_inode_ctx_get (loc->inode, this, &bdatt);
+        if (!cl_type && !data) {
+                /* none of the BD keys present: plain setxattr */
+                STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, loc, dict,
+                            flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->data = data;
+        loc_copy (&local->loc, loc);
+        local->inode = inode_ref (loc->inode);
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s not mapped to BD", loc->path);
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                if (cl_type == BD_OF_MERGE)
+                        bd_do_merge (frame, this);
+                else
+                        bd_offload (frame, this, loc, NULL, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s already mapped to BD", loc->path);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->stat, loc, xdata);
+        }
+
+        return 0;
+out:
+        if (op_errno)
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_fsetxattr: Used to create/map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ * -> bd_fsetxattr_cbk
+ *
+ * The dict is searched, in this order, for BD_XATTR (map), BD_CLONE and
+ * BD_SNAPSHOT; BD_MERGE is rejected with EOPNOTSUPP.  When none of them is
+ * present the request is forwarded unchanged to the child.
+ *
+ * Always returns 0; errors are reported through the unwind.
+ */
+int32_t
+bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int flags, dict_t *xdata)
+{
+        /* EINVAL is what a failed VALIDATE_OR_GOTO must report */
+        int           op_errno = EINVAL;
+        data_t       *data     = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_local_t   *local    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE))) {
+                /*
+                 * bd_merge is not supported for fsetxattr, because snapshot LV
+                 * is opened and it causes problem in snapshot merge
+                 */
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+        if (!cl_type && !data) {
+                /* non bd file object */
+                STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            fd, dict, flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->inode = inode_ref (fd->inode);
+        local->fd = fd_ref (fd);
+        local->data = data;
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p not mapped to BD", fd);
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                bd_offload (frame, this, NULL, fd, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p already mapped to BD", fd);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        }
+
+        return 0;
+out:
+        /* this is the fsetxattr fop, so unwind it as such */
+        BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_removexattr: Refuses removal of BD_XATTR (unwinds with ENODATA); every
+ * other xattr name is forwarded to the child.  Always returns 0.
+ */
+int32_t
+bd_removexattr (call_frame_t *frame, xlator_t *this,
+                loc_t *loc, const char *name, dict_t *xdata)
+{
+        if (strcmp (name, BD_XATTR) == 0) {
+                BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_fremovexattr: fd based variant of bd_removexattr.  Refuses removal of
+ * BD_XATTR (unwinds with ENODATA); every other xattr name is forwarded to the
+ * child.  Always returns 0.
+ */
+int32_t
+bd_fremovexattr (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, const char *name, dict_t *xdata)
+{
+        if (!strcmp (name, BD_XATTR))
+                goto out;
+
+        /* the callback must match the fop being wound (fremovexattr) */
+        STACK_WIND (frame, default_fremovexattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL);
+        return 0;
+}
+
+/*
+ * Callback of the [f]setxattr wound by bd_trunc_setxattr_cbk to revert the
+ * BD xattr.  Whatever the outcome of the revert, the truncate itself has
+ * failed, so the fop is unwound with -1/EIO: as ftruncate when local->fd is
+ * set, as truncate otherwise.
+ */
+int
+bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (local->fd == NULL) {
+                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+                return 0;
+        }
+
+        BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Call back for [f]setxattr after storing the new size in BD_XATTR.
+ *
+ * On success the LV is resized with bd_resize() to the size kept in
+ * local->bdatt, the cached size in the inode context is updated and the
+ * [f]truncate is unwound with the old and new attributes.
+ *
+ * If the resize fails, BD_XATTR in local->dict is replaced by the previous
+ * "<type>:<size>" value and wound down again to revert the xattr;
+ * bd_trunc_setxattr_setx_cbk then fails the fop.  If the setxattr itself
+ * failed, or the inode has no cached BD attributes to build the previous
+ * value from, the fop is failed with EIO right away.
+ */
+int
+bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t  *local  = frame->local;
+        bd_attr_t   *bdatt  = NULL;
+        struct iatt  prebuf = {0, };
+        char        *bd     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        bd_inode_ctx_get (local->inode, this, &bdatt);
+        if (!bdatt)
+                goto out; /* no cached type/size: the old xattr is unknown */
+
+        op_errno = bd_resize (this->private, local->inode->gfid,
+                              local->bdatt->iatt.ia_size);
+        if (op_errno)
+                goto revert_xattr;
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        /* LV resized, update new size in the cache */
+        bdatt->iatt.ia_size = local->bdatt->iatt.ia_size;
+
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+
+        return 0;
+
+revert_xattr:
+        /*
+         * revert setxattr: the value currently in local->dict belongs to the
+         * dict and must not be freed here.  Store the old value under the
+         * same key instead.
+         * NOTE(review): relies on dict_set_dynstr replacing an existing key
+         * and taking ownership of the string - confirm against dict.c
+         */
+        if (gf_asprintf (&bd, "%s:%"PRIu64, bdatt->type,
+                         bdatt->iatt.ia_size) < 0)
+                goto out;
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd))
+                goto out;
+
+        if (local->fd)
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback of the [f]stat wound by bd_do_trunc.
+ *
+ * Builds the BD_XATTR value "<type>:<size>" from the cached type of the
+ * inode and the requested size kept in local->bdatt, stores it in a new
+ * local->dict and winds [f]setxattr to the child; bd_trunc_setxattr_cbk
+ * continues from there.  A failed stat, a failed allocation or an inode
+ * without cached BD attributes fails the [f]truncate.
+ */
+int
+bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local     = frame->local;
+        bd_attr_t  *bdatt     = NULL;
+        char       *xattr_val = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        bd_inode_ctx_get (local->inode, this, &bdatt);
+        if (bdatt == NULL) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        gf_asprintf (&xattr_val, "%s:%ld", bdatt->type,
+                     local->bdatt->iatt.ia_size);
+        if (dict_set_dynstr (local->dict, BD_XATTR, xattr_val) != 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (local->fd) {
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        } else {
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+        }
+
+        return 0;
+out:
+        if (local->fd) {
+                BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        } else {
+                BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        }
+        GF_FREE (xattr_val);
+        return 0;
+}
+
+/*
+ * bd_do_trunc: Common body of bd_truncate (@loc set, @fd NULL) and
+ * bd_ftruncate (@fd set, @loc NULL) for an inode mapped to a BD.
+ *
+ * A request that does not grow the LV is answered immediately with success
+ * after updating the cached mtime.  Otherwise the requested size, adjusted by
+ * bd_adjust_size(), is remembered in local->bdatt and a stat (truncate) or
+ * fstat (ftruncate) is wound to the child; bd_trunc_stat_cbk continues from
+ * there.
+ */
+void
+bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc,
+             off_t offset, bd_attr_t *bdatt)
+{
+        bd_local_t  *local    = NULL;
+        struct iatt  prebuf   = {0, };
+        int          op_errno = 0;
+        int          op_ret   = -1;
+
+        /* If requested size is less than LV size, return success */
+        if (offset <= bdatt->iatt.ia_size) {
+                memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+                bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+                op_ret = 0;
+                goto out;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (fd) {
+                local->inode = inode_ref (fd->inode);
+                local->fd = fd_ref (fd);
+        } else {
+                local->inode = inode_ref (loc->inode);
+                loc_copy (&local->loc, loc);
+        }
+
+        local->bdatt->iatt.ia_size =
+                bd_adjust_size (this->private, offset);
+
+        /* fd is NULL on the truncate path, so fstat cannot be used there */
+        if (fd)
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fstat, fd, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->stat, loc, NULL);
+
+        return;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        return;
+}
+
+/*
+ * bd_ftruncate: Resizes a LV if fd belongs to BD.
+ *
+ * When the inode of @fd has no BD context the request is forwarded to the
+ * child unchanged; otherwise bd_do_trunc handles it.  Always returns 0.
+ */
+int32_t
+bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+        /* only reached by 'out' through a failed VALIDATE_OR_GOTO */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_ftruncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->ftruncate, fd,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, fd, NULL, offset, bdatt);
+        return 0;
+out:
+        BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_truncate: Resizes a LV if file maps to LV.
+ *
+ * When the inode of @loc has no BD context the request is forwarded to the
+ * child unchanged; otherwise bd_do_trunc handles it.  Always returns 0.
+ */
+int32_t
+bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+        /* only reached by 'out' through a failed VALIDATE_OR_GOTO */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_truncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->truncate, loc,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, NULL, loc, offset, bdatt);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * __bd_pwritev: Writes @count iovecs from @vector to @fd at @offset with a
+ * single pwritev() call.
+ *
+ * @bd_size is accepted for the callers' benefit but is not used here.
+ *
+ * Returns the value returned by pwritev() on success, -EFAULT when @vector
+ * is NULL, or the negated errno of a failed pwritev().
+ */
+int32_t
+__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
+              uint64_t bd_size)
+{
+        int retval      = 0;
+        int saved_errno = 0;
+
+        if (!vector)
+                return -EFAULT;
+
+        retval = pwritev (fd, vector, count, offset);
+        if (retval == -1) {
+                /* capture errno before logging can overwrite it */
+                saved_errno = errno;
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "base %p, length %ld, offset %ld, message %s",
+                        (count > 0) ? vector[0].iov_base : NULL,
+                        (count > 0) ? (long) vector[0].iov_len : 0L,
+                        (long) offset, strerror (saved_errno));
+                retval = -saved_errno;
+        }
+
+        return retval;
+}
+
+/*
+ * bd_writev: Writes to the LV if fd is a BD fd, otherwise forwards the
+ * request to the child (default_writev_cbk is used as the callback).
+ *
+ * For a BD fd the data is written with __bd_pwritev() to the descriptor kept
+ * in the fd context, the cached mtime of the inode is updated and the fop is
+ * unwound with the cached attributes as they were before that update
+ * (prebuf) and after it (postbuf).
+ *
+ * Always returns 0; errors are reported through the unwind.
+ */
+int
+bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdict)
+{
+        int32_t      op_ret   = -1;
+        /* EINVAL covers failed validation and a missing inode context */
+        int32_t      op_errno = EINVAL;
+        int          ret      = -1;
+        bd_fd_t     *bd_fd    = NULL;
+        bd_attr_t   *bdatt    = NULL;
+        struct iatt  prebuf   = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (vector, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) { /* posix fd */
+                STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->writev, fd, vector, count,
+                            offset, flags, iobref, xdict);
+                return 0;
+        }
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt))
+                goto out;
+
+        op_ret = __bd_pwritev (bd_fd->fd, vector, count, offset,
+                               bdatt->iatt.ia_size);
+        if (op_ret < 0) {
+                op_errno = -op_ret;
+                op_ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
+                        ", %s", (uint64_t) offset, strerror (op_errno));
+                goto out;
+        }
+
+        op_errno = 0;
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+out:
+        /* bdatt is still NULL when we bail out before the ctx lookup */
+        BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf,
+                         bdatt ? &bdatt->iatt : NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback of the setattr wound by bd_setattr.
+ *
+ * @cookie is the heap copy of the 'valid' bitmask made by bd_setattr; it is
+ * released here.  On success every attribute group selected in that mask is
+ * copied from @postbuf into the cached attributes of the inode, ctime is
+ * always refreshed, and @postbuf is then overwritten with the cached
+ * attributes before unwinding.
+ */
+int
+bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+        bd_attr_t  *bdatt = NULL;
+        int        *valid = cookie;
+        bd_local_t *local = frame->local;
+
+        if (op_ret < 0 || !valid || !local)
+                goto out;
+
+        if (bd_inode_ctx_get (local->inode, this, &bdatt))
+                goto out;
+
+        /* valid is a bitmask: several groups may be set in one request */
+        if (*valid & GF_SET_ATTR_UID)
+                bdatt->iatt.ia_uid = postbuf->ia_uid;
+        if (*valid & GF_SET_ATTR_GID)
+                bdatt->iatt.ia_gid = postbuf->ia_gid;
+        if (*valid & GF_SET_ATTR_MODE) {
+                bdatt->iatt.ia_type = postbuf->ia_type;
+                bdatt->iatt.ia_prot = postbuf->ia_prot;
+        }
+        if (*valid & GF_SET_ATTR_ATIME) {
+                bdatt->iatt.ia_atime = postbuf->ia_atime;
+                bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec;
+        }
+        if (*valid & GF_SET_ATTR_MTIME) {
+                bdatt->iatt.ia_mtime = postbuf->ia_mtime;
+                bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec;
+        }
+
+        bdatt->iatt.ia_ctime = postbuf->ia_ctime;
+        bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec;
+
+        memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt));
+out:
+        FREE (valid);
+        BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf,
+                         postbuf, xdata);
+        return 0;
+}
+
+/*
+ * bd_setattr: Forwards setattr to the child.  For an inode that has a BD
+ * context, a heap copy of @valid is passed along as the cookie so that
+ * bd_setattr_cbk can refresh the cached attributes; for any other inode the
+ * default callback is used.  Allocation failures unwind with ENOMEM.
+ */
+int
+bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+        int        *valid_copy = NULL;
+        bd_attr_t  *bdatt      = NULL;
+        bd_local_t *local      = NULL;
+        int         op_errno   = 0;
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt) != 0) {
+                /* not a BD inode */
+                STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setattr,
+                            loc, stbuf, valid, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        valid_copy = CALLOC (1, sizeof (valid));
+        BD_VALIDATE_MEM_ALLOC (valid_copy, op_errno, out);
+
+        *valid_copy = valid;
+        local->inode = inode_ref (loc->inode);
+
+        STACK_WIND_COOKIE (frame, bd_setattr_cbk, valid_copy,
+                           FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->setattr,
+                           loc, stbuf, valid, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * Callback of the link wound by bd_link.  When the link succeeded and the
+ * inode has a BD context, ctime and the link count of the cached attributes
+ * are refreshed from @buf, and @buf is then overwritten with the cached
+ * attributes.  The fop is unwound without xdata in every case.
+ */
+int
+bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+        bd_attr_t *cached = NULL;
+
+        if (op_ret >= 0 && !bd_inode_ctx_get (inode, this, &cached)) {
+                cached->iatt.ia_ctime      = buf->ia_ctime;
+                cached->iatt.ia_ctime_nsec = buf->ia_ctime_nsec;
+                cached->iatt.ia_nlink      = buf->ia_nlink;
+                memcpy (buf, &cached->iatt, sizeof (struct iatt));
+        }
+
+        BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf,
+                         preparent, postparent, NULL);
+        return 0;
+}
+
+/*
+ * bd_link: Forwards link to the child unchanged; bd_link_cbk refreshes the
+ * cached attributes of a BD inode on the way back.  Always returns 0.
+ */
+int
+bd_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * bd_handle_special_xattrs: Answers [f]getxattr for the xattr names served
+ * by this xlator itself.
+ *
+ * VOL_TYPE is answered with 1, VOL_CAPS with priv->caps, and any other name
+ * is handed to bd_get_origin().  The reply is unwound as getxattr when @loc
+ * is set and as fgetxattr otherwise.  op_errno is ENOMEM unless the value
+ * was produced successfully (op_ret == 0).
+ *
+ * Always returns 0.
+ */
+int
+bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                          fd_t *fd, const char *name, dict_t *xdata)
+{
+        dict_t    *xattr    = NULL;
+        int        op_ret   = -1;
+        int        op_errno = ENOMEM;
+        bd_priv_t *priv     = this->private;
+
+        xattr = dict_new ();
+        if (!xattr)
+                goto out;
+
+        if (!strcmp (name, VOL_TYPE))
+                op_ret = dict_set_int64 (xattr, (char *)name, 1);
+        else if (!strcmp (name, VOL_CAPS))
+                op_ret = dict_set_int64 (xattr, (char *)name, priv->caps);
+        else
+                op_ret = bd_get_origin (this->private, loc, fd, xattr);
+
+        /* do not report an errno together with a successful reply */
+        if (!op_ret)
+                op_errno = 0;
+
+out:
+        if (loc)
+                BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+        else
+                BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+
+        /* xattr is NULL when dict_new failed */
+        if (xattr) {
+                dict_reset (xattr);
+                dict_unref (xattr);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_fgetxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; every other request (including a NULL @name) is
+ * forwarded to the child.  Always returns 0.
+ */
+int
+bd_fgetxattr (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *name, dict_t *xdata)
+{
+        int special = 0;
+
+        if (name != NULL)
+                special = (strcmp (name, VOL_TYPE) == 0) ||
+                          (strcmp (name, VOL_CAPS) == 0) ||
+                          (strcmp (name, BD_ORIGIN) == 0);
+
+        if (special) {
+                bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata);
+        } else {
+                STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fgetxattr,
+                            fd, name, xdata);
+        }
+        return 0;
+}
+
+/*
+ * bd_getxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; every other request (including a NULL @name) is
+ * forwarded to the child.  Always returns 0.
+ */
+int
+bd_getxattr (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *name, dict_t *xdata)
+{
+        int special = 0;
+
+        if (name != NULL)
+                special = (strcmp (name, VOL_TYPE) == 0) ||
+                          (strcmp (name, VOL_CAPS) == 0) ||
+                          (strcmp (name, BD_ORIGIN) == 0);
+
+        if (special) {
+                bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata);
+        } else {
+                STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->getxattr,
+                            loc, name, xdata);
+        }
+
+        return 0;
+}
+
+/*
+ * Callback of the lookup wound by bd_unlink.
+ *
+ * When the file has more than one link only the posix entry is removed.
+ * Otherwise the LV named after the gfid is deleted first; a missing LV
+ * (ENOENT) is tolerated, any other failure fails the unlink.  The unlink of
+ * the posix file is then wound to the child with default_unlink_cbk.
+ */
+int
+bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *buf, dict_t *xattr,
+                      struct iatt *postparent)
+{
+        bd_gfid_t   gfid  = {0, };
+        bd_local_t *local = frame->local;
+
+        /* local->loc is needed on every path that continues */
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /* buf and inode are only meaningful after a successful lookup */
+        if (op_ret < 0)
+                goto out;
+
+        if (buf->ia_nlink > 1)
+                goto posix;
+
+        uuid_utoa_r (inode->gfid, gfid);
+        if (bd_delete_lv (this->private, gfid, &op_errno) < 0) {
+                if (op_errno != ENOENT)
+                        goto out;
+        }
+
+posix:
+        /* remove posix */
+        STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_unlink: For an inode without a BD context the unlink is forwarded to
+ * the child unchanged.  For a BD inode, @loc is copied into the local and a
+ * lookup is wound first; bd_unlink_lookup_cbk then removes the LV and the
+ * posix file.  Always returns 0.
+ */
+int
+bd_unlink (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, int xflag, dict_t *xdata)
+{
+        /* EINVAL is what a failed VALIDATE_OR_GOTO must report */
+        int         op_errno = EINVAL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->unlink,
+                            loc, xflag, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        loc_copy (&local->loc, loc);
+
+        STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->lookup, loc, NULL);
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/* dumpops.priv hook: currently a stub that dumps nothing and returns 0. */
+int32_t
+bd_priv (xlator_t *this)
+{
+ return 0;
+}
+
+/* dumpops.inode hook: currently a stub that dumps nothing and returns 0. */
+int32_t
+bd_inode (xlator_t *this)
+{
+ return 0;
+}
+
+/*
+ * bd_rchecksum: Computes the weak (rsync) and strong checksums of up to @len
+ * bytes read at @offset.
+ *
+ * For an fd without a BD fd context the request is forwarded to the child.
+ * For a BD fd the data is read with pread() from the descriptor kept in the
+ * fd context while holding fd->lock, and the checksums are computed over the
+ * bytes actually read.
+ *
+ * Always returns 0; errors are reported through the unwind.
+ */
+int32_t
+bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              int32_t len, dict_t *xdata)
+{
+        int            op_ret        = -1;
+        int            op_errno      = 0;
+        int            ret           = 0;
+        int            _fd           = -1;
+        char          *alloc_buf     = NULL;
+        char          *buf           = NULL;
+        int32_t        weak_checksum = 0;
+        bd_fd_t       *bd_fd         = NULL;
+        unsigned char  strong_checksum[MD5_DIGEST_LENGTH] = {0};
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->rchecksum, fd, offset,
+                            len, xdata);
+                return 0;
+        }
+
+        alloc_buf = page_aligned_alloc (len, &buf);
+        if (!alloc_buf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        _fd = bd_fd->fd;
+
+        LOCK (&fd->lock);
+        {
+                ret = pread (_fd, buf, len, offset);
+                if (ret < 0) {
+                        /* capture errno before logging can overwrite it */
+                        op_errno = errno;
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "pread of %d bytes returned %d (%s)",
+                                len, ret, strerror (op_errno));
+                }
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret < 0)
+                goto out;
+
+        /*
+         * A short read leaves the tail of buf unwritten, so only the bytes
+         * that were actually read take part in the checksums.
+         */
+        weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf,
+                                                (size_t) ret);
+        gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret,
+                                  (unsigned char *) strong_checksum);
+
+        op_ret = 0;
+out:
+        BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno,
+                         weak_checksum, strong_checksum, NULL);
+
+        GF_FREE (alloc_buf);
+
+        return 0;
+}
+
+/**
+ * notify - on GF_EVENT_PARENT_UP, report GF_EVENT_CHILD_UP through
+ * default_notify; every other event is ignored.  Always returns 0.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        if (event == GF_EVENT_PARENT_UP) {
+                /* Tell the parent that bd xlator is up */
+                default_notify (this, GF_EVENT_CHILD_UP, data);
+        }
+
+        return 0;
+}
+
+/*
+ * mem_acct_init: Registers the memory accounting types of this xlator
+ * (gf_bd_mt_end + 1 of them).  Returns -1 when @this is NULL, otherwise the
+ * result of xlator_mem_acct_init().
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
+
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_ERROR, "Memory accounting init "
+                        "failed");
+        return ret;
+}
+
+/*
+ * reconfigure: Re-reads the "bd-aio" option into priv->aio_configured and
+ * switches AIO on or off accordingly.  Returns 0 on success and -1 when the
+ * option could not be read.
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        bd_priv_t *priv = this->private;
+        int        ret  = -1;
+
+        GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options,
+                          bool, out);
+
+        if (!priv->aio_configured)
+                bd_aio_off (this);
+        else
+                bd_aio_on (this);
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/**
+ * bd xlator init - Validate configured VG
+ *
+ * Requires a child xlator, reads the "export" (VG name) and "device" options,
+ * accepts only BACKEND_VG as device, creates the bd_local_t pool, allocates
+ * the private structure, opens an LVM handle, scans the VG and, when "bd-aio"
+ * is enabled, turns AIO on.
+ *
+ * Returns 0 on success.  On failure everything acquired so far is released,
+ * this->private and this->local_pool are reset to NULL and -1 is returned.
+ */
+int
+init (xlator_t *this)
+{
+        char      *vg_data  = NULL;
+        char      *device   = NULL;
+        bd_priv_t *_private = NULL;
+
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: storage/bd needs posix as subvolume");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Volume is dangling. Please check the volume file.");
+        }
+
+        GF_OPTION_INIT ("export", vg_data, str, error);
+        GF_OPTION_INIT ("device", device, str, error);
+
+        /* Now we support only LV device */
+        if (strcasecmp (device, BACKEND_VG)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: unknown %s backend %s", BD_XLATOR, device);
+                return -1;
+        }
+
+        this->local_pool = mem_pool_new (bd_local_t, 64);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: Failed to create bd memory pool");
+                return -1;
+        }
+
+        _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private);
+        if (!_private)
+                goto error;
+
+        this->private = _private;
+        _private->vg = gf_strdup (vg_data);
+        if (!_private->vg)
+                goto error;
+
+        _private->handle = lvm_init (NULL);
+        if (!_private->handle) {
+                gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed");
+                goto error;
+        }
+        _private->caps = BD_CAPS_BD;
+        if (bd_scan_vg (this, _private))
+                goto error;
+
+        _private->aio_init_done = _gf_false;
+        _private->aio_capable = _gf_false;
+
+        GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error);
+        if (_private->aio_configured) {
+                if (bd_aio_on (this)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "BD AIO init failed");
+                        goto error;
+                }
+        }
+
+        _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT;
+
+        return 0;
+error:
+        /* reached both before and after _private has been allocated */
+        if (_private) {
+                GF_FREE (_private->vg);
+                if (_private->handle)
+                        lvm_quit (_private->handle);
+                GF_FREE (_private);
+        }
+        /* do not leave a freed pointer behind for fini() to free again */
+        this->private = NULL;
+
+        if (this->local_pool) {
+                mem_pool_destroy (this->local_pool);
+                this->local_pool = NULL;
+        }
+        return -1;
+}
+
+/*
+ * fini: Destroys the local pool and, when a private structure exists, closes
+ * the LVM handle and frees the VG name and the structure itself.
+ */
+void
+fini (xlator_t *this)
+{
+        bd_priv_t *priv = this->private;
+
+        mem_pool_destroy (this->local_pool);
+        this->local_pool = NULL;
+
+        if (priv != NULL) {
+                lvm_quit (priv->handle);
+                GF_FREE (priv->vg);
+                this->private = NULL;
+                GF_FREE (priv);
+        }
+}
+
+/* State-dump hooks of the bd xlator; both are currently stubs. */
+struct xlator_dumpops dumpops = {
+ .priv = bd_priv,
+ .inode = bd_inode,
+};
+
+/* File operations implemented by the bd xlator. */
+struct xlator_fops fops = {
+ .readdirp = bd_readdirp,
+ .lookup = bd_lookup,
+ .stat = bd_stat,
+ .statfs = bd_statfs,
+ .open = bd_open,
+ .fstat = bd_fstat,
+ .rchecksum = bd_rchecksum,
+ .readv = bd_readv,
+ .fsync = bd_fsync,
+ .setxattr = bd_setxattr,
+ .fsetxattr = bd_fsetxattr,
+ .removexattr = bd_removexattr,
+ .fremovexattr=bd_fremovexattr,
+ .truncate = bd_truncate,
+ .ftruncate = bd_ftruncate,
+ .writev = bd_writev,
+ .getxattr = bd_getxattr,
+ .fgetxattr = bd_fgetxattr,
+ .unlink = bd_unlink,
+ .link = bd_link,
+ .flush = bd_flush,
+ .setattr = bd_setattr,
+ .discard = bd_discard,
+};
+
+/* fd release and inode forget callbacks of the bd xlator. */
+struct xlator_cbks cbks = {
+ .release = bd_release,
+ .forget = bd_forget,
+};
+
+/*
+ * Volume options read by init()/reconfigure():
+ * "export" - string copied into priv->vg (the VG name)
+ * "device" - backend type; init() accepts only BACKEND_VG
+ * "bd-aio" - boolean, enables AIO through bd_aio_on()
+ * The table is terminated by an entry with a NULL key.
+ */
+struct volume_options options[] = {
+ { .key = {"export"},
+ .type = GF_OPTION_TYPE_STR},
+ { .key = {"device"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = BACKEND_VG},
+ {
+ .key = {"bd-aio"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for native Linux AIO"
+ },
+
+ { .key = {NULL} }
+};
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
new file mode 100644
index 000000000..34b4c9e22
--- /dev/null
+++ b/xlators/storage/bd/src/bd.h
@@ -0,0 +1,178 @@
+/*
+ BD translator - Exports Block devices on server side as regular
+ files to client
+
+ Copyright IBM, Corp. 2012
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BD_H
+#define _BD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "xlator.h"
+#include "mem-types.h"
+
+/* Name used in log messages and the only accepted "device" option value */
+#define BD_XLATOR "block device mapper xlator"
+#define BACKEND_VG "vg"
+
+/* xattr that marks a posix file as mapped to a BD; value is "<type>:<size>" */
+#define GF_XATTR "user.glusterfs"
+#define BD_XATTR GF_XATTR ".bd"
+
+#define BD_LV "lv"
+#define BD_THIN "thin"
+
+/* Paths of the LVM command line tools */
+#define LVM_RESIZE "/sbin/lvresize"
+#define LVM_CREATE "/sbin/lvcreate"
+#define LVM_CONVERT "/sbin/lvconvert"
+
+/* xattr names answered by the xlator itself on [f]getxattr */
+#define VOL_TYPE "volume.type"
+#define VOL_CAPS "volume.caps"
+
+#define ALIGN_SIZE 4096
+
+/* Capability bits kept in bd_priv_t.caps and reported through VOL_CAPS */
+#define BD_CAPS_BD 0x01
+#define BD_CAPS_THIN 0x02
+#define BD_CAPS_OFFLOAD_COPY 0x04
+#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08
+
+/* [f]setxattr keys that request an offload; BD_ORIGIN is a getxattr name */
+#define BD_CLONE "clone"
+#define BD_SNAPSHOT "snapshot"
+#define BD_MERGE "merge"
+#define BD_ORIGIN "list-origin"
+
+#define IOV_NR 4
+#define IOV_SIZE (64 * 1024)
+
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
+/*
+ * If buff is NULL: set op_errno to ENOMEM, log "out of memory" and jump to
+ * label.  Expects a variable named 'this' (used for this->name) to be in
+ * scope at the call site.
+ * NOTE(review): expands to a bare 'if' statement, so it must not be used as
+ * the body of an if/else without braces.
+ */
+#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \
+ if (!buff) { \
+ op_errno = ENOMEM; \
+ gf_log (this->name, GF_LOG_ERROR, "out of memory"); \
+ goto label; \
+ }
+
+/*
+ * If local is NULL: set op_errno to EINVAL and jump to label.
+ * NOTE(review): expands to a bare 'if' statement, so it must not be used as
+ * the body of an if/else without braces.
+ */
+#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \
+ if (!local) { \
+ op_errno = EINVAL; \
+ goto label; \
+ }
+
+/*
+ * Unwind the fop 'typ' on frame and release the bd_local_t attached to it.
+ * frame->local is detached (set to NULL) before STACK_UNWIND_STRICT runs and
+ * is passed to bd_local_free afterwards, if it was set.  frame is
+ * dereferenced before the unwind, so it must not be NULL.
+ */
+#define BD_STACK_UNWIND(typ, frame, args ...) do { \
+ bd_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ bd_local_free (__this, __local); \
+ } while (0)
+
+/* Buffer for the textual form of a gfid, as filled in by uuid_utoa_r() */
+typedef char bd_gfid_t[GF_UUID_BUF_SIZE];
+
+/*
+ * Memory accounting types of the bd xlator, numbered after the common ones.
+ * gf_bd_mt_end + 1 is the count passed to xlator_mem_acct_init().
+ */
+enum gf_bd_mem_types_ {
+ gf_bd_private = gf_common_mt_end + 1,
+ gf_bd_attr,
+ gf_bd_fd,
+ gf_bd_mt_end
+};
+
+/**
+ * bd_fd - internal structure kept in the fd context of a BD file
+ */
+typedef struct bd_fd {
+ /* descriptor used for pread()/pwritev() on the block device */
+ int fd;
+ /* NOTE(review): presumably the open flags - confirm in bd_open */
+ int32_t flag;
+ /* NOTE(review): presumably non-zero for O_DIRECT - confirm in bd_open */
+ int odirect;
+} bd_fd_t;
+
+/* Per-xlator private data, allocated in init() and released in fini() */
+typedef struct bd_priv {
+ /* LVM handle from lvm_init(), closed with lvm_quit() */
+ lvm_t handle;
+ /* copy of the "export" option */
+ char *vg;
+ /* NOTE(review): not referenced in bd.c as seen here - presumably the thin pool name */
+ char *pool;
+ /* bitmask of BD_CAPS_* */
+ int caps;
+ gf_boolean_t aio_init_done;
+ gf_boolean_t aio_capable;
+ /* value of the "bd-aio" option */
+ gf_boolean_t aio_configured;
+#ifdef HAVE_LIBAIO
+ io_context_t ctxp;
+ pthread_t aiothread;
+#endif
+} bd_priv_t;
+
+
+/* Kind of block device backing a file */
+typedef enum bd_type {
+ BD_TYPE_NONE,
+ BD_TYPE_LV,
+} bd_type_t;
+
+/* Attributes of a BD file cached in the inode context */
+typedef struct {
+ /* cached attributes returned to callers in place of the posix ones */
+ struct iatt iatt;
+ /* type string, the "<type>" part of the "<type>:<size>" BD_XATTR value */
+ char *type;
+} bd_attr_t;
+
+/*
+ * Operation requested through [f]setxattr.  BD_OF_NONE is 0 and is also used
+ * for a plain BD_XATTR mapping request, so callers test it as a boolean.
+ */
+typedef enum {
+ BD_OF_NONE,
+ BD_OF_CLONE,
+ BD_OF_SNAPSHOT,
+ BD_OF_MERGE,
+} bd_offload_t;
+
+/* Per-fop state kept in frame->local; released by bd_local_free() */
+typedef struct {
+ /* xattr dict wound to the child by the truncate and offload paths */
+ dict_t *dict;
+ /* truncate: carries the requested (adjusted) size in iatt.ia_size */
+ bd_attr_t *bdatt;
+ /* reference taken with inode_ref() */
+ inode_t *inode;
+ /* copy of the request loc for path based fops */
+ loc_t loc;
+ /* reference taken with fd_ref() for fd based fops, NULL otherwise */
+ fd_t *fd;
+ data_t *data; /* for setxattr */
+ /* offload operation being carried out */
+ bd_offload_t offload;
+ /* size parsed from the offload request, or the default extent */
+ uint64_t size;
+ /* offload: loc of the destination, looked up by gfid */
+ loc_t *dloc;
+} bd_local_t;
+
+/* Prototypes */
+
+/* inode / fd context and per-fop local helpers */
+int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx);
+int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx);
+int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd);
+bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this);
+void bd_local_free (xlator_t *this, bd_local_t *local);
+
+/* generic helpers */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv);
+char *page_aligned_alloc (size_t size, char **aligned_buf);
+int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                          uint64_t *lv_size, uuid_t uuid);
+uint64_t bd_get_default_extent (bd_priv_t *priv);
+uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size);
+/* declared without 'inline': the header provides no definition */
+void bd_update_amtime (struct iatt *iatt, int flag);
+
+/* LV operations */
+int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv);
+int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size);
+int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno);
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+
+#endif
diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am
deleted file mode 100644
index 7e2376979..000000000
--- a/xlators/storage/bdb/src/Makefile.am
+++ /dev/null
@@ -1,18 +0,0 @@
-
-xlator_LTLIBRARIES = bdb.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/storage
-
-bdb_la_LDFLAGS = -module -avoidversion
-
-bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c
-bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = bdb.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-AM_LDFLAGS = -ldb
-
-CLEANFILES =
-
diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c
deleted file mode 100644
index 18f563fb3..000000000
--- a/xlators/storage/bdb/src/bctx.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <list.h>
-#include <bdb.h>
-#include <libgen.h> /* for dirname */
-
-/* Release a context together with the two path strings it holds. */
-static void
-__destroy_bctx (bctx_t *bctx)
-{
-        if (bctx->directory) {
-                FREE (bctx->directory);
-        }
-
-        if (bctx->db_path) {
-                FREE (bctx->db_path);
-        }
-
-        FREE (bctx);
-}
-
-/* Unlink @bctx from whichever hash bucket currently holds it. */
-static void
-__unhash_bctx (bctx_t *bctx)
-{
-        list_del_init (&(bctx->b_hash));
-}
-
-/*
- * bctx_table_prune - trim the LRU list down to table->lru_limit and then
- * close and free every context queued on the table's purge list.
- *
- * @table: context table to prune (NULL is tolerated).
- *
- * return: number of contexts moved off the LRU list by this call. Failures
- *         to close a database are logged and do not affect the count.
- */
-static int32_t
-bctx_table_prune (bctx_table_t *table)
-{
-        int32_t           pruned    = 0;
-        int32_t           close_ret = 0;
-        struct list_head  purge     = {0,};
-        struct list_head *next      = NULL;
-        bctx_t           *entry     = NULL;
-        bctx_t           *del       = NULL, *tmp = NULL;
-
-        if (!table)
-                return 0;
-
-        INIT_LIST_HEAD (&purge);
-
-        LOCK (&table->lock);
-        {
-                /* an lru_limit of 0 disables eviction */
-                while ((table->lru_limit) &&
-                       (table->lru_size > table->lru_limit)) {
-                        next = table->b_lru.next;
-                        entry = list_entry (next, bctx_t, list);
-
-                        list_move_tail (next, &table->purge);
-                        __unhash_bctx (entry);
-
-                        table->lru_size--;
-                        pruned++;
-                }
-
-                /* link the local head into the table's purge ring and then
-                 * unlink the table's own head: everything that was queued
-                 * for purging now hangs off the local list and can be torn
-                 * down without holding the table lock */
-                list_move_tail (&purge, &table->purge);
-                list_del_init (&table->purge);
-        }
-        UNLOCK (&table->lock);
-
-        list_for_each_entry_safe (del, tmp, &purge, list) {
-                list_del_init (&del->list);
-                if (del->primary) {
-                        close_ret = del->primary->close (del->primary, 0);
-                        if (close_ret != 0) {
-                                gf_log (table->this->name, GF_LOG_DEBUG,
-                                        "_BCTX_TABLE_PRUNE %s: %s "
-                                        "(failed to close primary database)",
-                                        del->directory,
-                                        db_strerror (close_ret));
-                        } else {
-                                gf_log (table->this->name, GF_LOG_DEBUG,
-                                        "_BCTX_TABLE_PRUNE %s (lru=%d)"
-                                        "(closed primary database)",
-                                        del->directory, table->lru_size);
-                        }
-                }
-                if (del->secondary) {
-                        close_ret = del->secondary->close (del->secondary, 0);
-                        if (close_ret != 0) {
-                                gf_log (table->this->name, GF_LOG_DEBUG,
-                                        "_BCTX_TABLE_PRUNE %s: %s "
-                                        "(failed to close secondary database)",
-                                        del->directory,
-                                        db_strerror (close_ret));
-                        } else {
-                                gf_log (table->this->name, GF_LOG_DEBUG,
-                                        "_BCTX_TABLE_PRUNE %s (lru=%d)"
-                                        "(closed secondary database)",
-                                        del->directory, table->lru_size);
-                        }
-                }
-                __destroy_bctx (del);
-        }
-
-        return pruned;
-}
-
-
-/* struct bdb_ctx related */
-/*
- * Hash a NUL-terminated key into the range [0, hash_size).
- * The first character seeds the accumulator; every following character is
- * folded in as acc = acc * 31 + c (written as a shift and a subtraction).
- */
-static inline uint32_t
-bdb_key_hash (char *key, uint32_t hash_size)
-{
-        uint32_t acc = *key;
-
-        if (acc != 0) {
-                key++;
-                while (*key != '\0') {
-                        acc = (acc << 5) - acc + *key;
-                        key++;
-                }
-        }
-
-        /* @key points at the terminating NUL here */
-        return (acc + *key) % hash_size;
-}
-
-/*
- * Compute the bucket for @bctx from its directory path and (re)insert the
- * context into that bucket of its table.
- */
-static void
-__hash_bctx (bctx_t *bctx)
-{
-        bctx_table_t *tbl = bctx->table;
-        char         *key = NULL;
-
-        MAKE_KEY_FROM_PATH (key, bctx->directory);
-        bctx->key_hash = bdb_key_hash (key, tbl->hash_size);
-
-        /* drop any previous bucket membership before linking the new one */
-        list_del_init (&bctx->b_hash);
-        list_add (&bctx->b_hash, &tbl->b_hash[bctx->key_hash]);
-}
-
-/*
- * Park a context that has no references left: contexts without an open
- * primary database go to the purge list (and leave the hash), the others
- * are appended to the LRU list.
- */
-static inline bctx_t *
-__bctx_passivate (bctx_t *bctx)
-{
-        bctx_table_t *tbl = bctx->table;
-
-        if (!bctx->primary) {
-                list_move_tail (&bctx->list, &tbl->purge);
-                __unhash_bctx (bctx);
-                return bctx;
-        }
-
-        list_move_tail (&bctx->list, &(tbl->b_lru));
-        tbl->lru_size++;
-
-        return bctx;
-}
-
-/* Move a context onto the table's active list and shrink the LRU count. */
-static inline bctx_t *
-__bctx_activate (bctx_t *bctx)
-{
-        bctx_table_t *tbl = bctx->table;
-
-        list_move (&bctx->list, &tbl->active);
-        tbl->lru_size--;
-
-        return bctx;
-}
-
-/*
- * Drop one reference from @bctx; when the count reaches zero the context
- * is passivated. Asserts that at least one reference is held on entry.
- */
-static bctx_t *
-__bdb_ctx_unref (bctx_t *bctx)
-{
-        assert (bctx->ref);
-
-        bctx->ref--;
-
-        if (bctx->ref)
-                return bctx;
-
-        return __bctx_passivate (bctx);
-}
-
-
-/*
- * bctx_unref - drop a reference on @bctx and prune its table afterwards.
- *
- * return: @bctx, or NULL when @bctx (or its table) is NULL.
- *
- * NOTE: once the last reference is gone the prune step may already have
- * destroyed the context (contexts without an open primary database are
- * queued for purging), so callers must not dereference the returned
- * pointer unless they still hold a reference of their own.
- */
-bctx_t *
-bctx_unref (bctx_t *bctx)
-{
-        bctx_table_t *table = NULL;
-
-        /* either condition alone makes the context unusable */
-        if (!bctx || !bctx->table)
-                return NULL;
-
-        table = bctx->table;
-
-        LOCK (&table->lock);
-        {
-                bctx = __bdb_ctx_unref (bctx);
-        }
-        UNLOCK (&table->lock);
-
-        bctx_table_prune (table);
-
-        return bctx;
-}
-
-/*
- * Take a reference on @bctx, activating it first when it currently has
- * none.
- *
- * NOTE: __bctx_ref() is called only after holding table->lock and
- * bctx->lock, in that order
- */
-static inline bctx_t *
-__bctx_ref (bctx_t *bctx)
-{
-        if (bctx->ref == 0) {
-                __bctx_activate (bctx);
-        }
-
-        ++bctx->ref;
-
-        return bctx;
-}
-
-/* Take a reference on @bctx under its table's lock and return @bctx. */
-bctx_t *
-bctx_ref (bctx_t *bctx)
-{
-        bctx_table_t *tbl = bctx->table;
-
-        LOCK (&(tbl->lock));
-        {
-                __bctx_ref (bctx);
-        }
-        UNLOCK (&(tbl->lock));
-
-        return bctx;
-}
-
-
-/* xlator that owns @table; the argument is parenthesised so that any
- * pointer expression can be passed safely */
-#define BDB_THIS(table) ((table)->this)
-
-/*
- * __create_bctx - allocate a context for directory @path, hash it into
- * @table and place it on the table's LRU list.
- *
- * return: the new context, or NULL when any allocation fails (nothing is
- *         left linked into @table in that case).
- */
-static inline bctx_t *
-__create_bctx (bctx_table_t *table,
-               const char *path)
-{
-        bctx_t *bctx    = NULL;
-        char   *db_path = NULL;
-
-        bctx = CALLOC (1, sizeof (*bctx));
-        GF_VALIDATE_OR_GOTO ("bctx", bctx, err);
-
-        bctx->table = table;
-        bctx->directory = strdup (path);
-        GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, err);
-
-        MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path);
-
-        bctx->db_path = strdup (db_path);
-        GF_VALIDATE_OR_GOTO ("bctx", bctx->db_path, err);
-
-        INIT_LIST_HEAD (&bctx->c_list);
-        INIT_LIST_HEAD (&bctx->list);
-        INIT_LIST_HEAD (&bctx->b_hash);
-
-        LOCK_INIT (&bctx->lock);
-
-        __hash_bctx (bctx);
-
-        list_add (&bctx->list, &table->b_lru);
-        table->lru_size++;
-
-        return bctx;
-
-err:
-        /* never hand out a half-initialised context: its list heads have
-         * not been set up, so callers could not safely link or ref it */
-        if (bctx)
-                __destroy_bctx (bctx);
-
-        return NULL;
-}
-
-/* bctx_lookup - lookup bctx_t for the directory @directory, creating one
- * when the table does not hold it yet. (see description of bctx_t in bdb.h)
- *
- * @table:     bctx_table_t for this instance of bdb.
- * @directory: directory for which bctx_t is being looked up.
- *
- * return: a referenced bctx_t, or NULL when an argument is NULL or a new
- *         context could not be created.
- */
-bctx_t *
-bctx_lookup (bctx_table_t *table,
-             const char *directory)
-{
-        char     *key      = NULL;
-        uint32_t  key_hash = 0;
-        bctx_t   *trav     = NULL, *bctx = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bctx", table, out);
-        GF_VALIDATE_OR_GOTO ("bctx", directory, out);
-
-        MAKE_KEY_FROM_PATH (key, directory);
-        key_hash = bdb_key_hash (key, table->hash_size);
-
-        LOCK (&table->lock);
-        {
-                /* an empty bucket simply yields no iterations */
-                list_for_each_entry (trav, &table->b_hash[key_hash], b_hash) {
-                        LOCK(&trav->lock);
-                        {
-                                if (!strcmp(trav->directory, directory))
-                                        bctx = __bctx_ref (trav);
-                        }
-                        UNLOCK(&trav->lock);
-
-                        if (bctx)
-                                break;
-                }
-
-                if (!bctx) {
-                        bctx = __create_bctx (table, directory);
-                        /* creation fails on out-of-memory; there is
-                         * nothing to reference in that case */
-                        if (bctx)
-                                bctx = __bctx_ref (bctx);
-                }
-        }
-        UNLOCK (&table->lock);
-out:
-        return bctx;
-}
-
-
-/*
- * bctx_parent - look up (or create) the context of the directory that
- * contains @path.
- *
- * return: referenced bctx_t of the parent directory, or NULL on failure.
- */
-bctx_t *
-bctx_parent (bctx_table_t *table,
-             const char *path)
-{
-        char   *scratch = NULL;
-        bctx_t *parent  = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bctx", table, out);
-        GF_VALIDATE_OR_GOTO ("bctx", path, out);
-
-        /* dirname() may modify its argument, so work on a private copy */
-        scratch = strdup (path);
-        GF_VALIDATE_OR_GOTO ("bctx", scratch, out);
-
-        parent = bctx_lookup (table, dirname (scratch));
-        GF_VALIDATE_OR_GOTO ("bctx", parent, out);
-
-out:
-        free (scratch);
-        return parent;
-}
diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c
deleted file mode 100644
index 777ff63e1..000000000
--- a/xlators/storage/bdb/src/bdb-ll.c
+++ /dev/null
@@ -1,1460 +0,0 @@
-/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include "bdb.h"
-#include <list.h>
-#include "hashfn.h"
-/*
- * implement the procedures to interact with bdb */
-
-/****************************************************************
- *
- * General wrappers and utility procedures for bdb xlator
- *
- ****************************************************************/
-
-/*
- * bdb_inode_transform - derive an inode number for the entry @name inside
- * the directory whose inode number is @parent.
- *
- * The low 32 bits of the parent number form the upper half of the result
- * and the low 32 bits of the name hash form the lower half.
- */
-ino_t
-bdb_inode_transform (ino_t parent,
-                     const char *name,
-                     size_t namelen)
-{
-        uint64_t hash = 0;
-        uint64_t high = 0;
-
-        hash = gf_dm_hashfn (name, namelen);
-
-        /* widen before shifting: shifting a 32-bit ino_t by 32 bits would
-         * be undefined behaviour */
-        high = ((uint64_t) parent) << 32;
-
-        return (ino_t) ((high | 0x00000000ffffffffULL)
-                        & (hash | 0xffffffff00000000ULL));
-}
-
-/*
- * bdb_generate_secondary_hash - key-extraction callback passed to
- * DB->associate(): the secondary key is a 32-bit hash of the primary key.
- *
- * @secondary: secondary database handle (unused).
- * @pkey:      primary key.
- * @data:      primary data (unused).
- * @skey:      filled with a heap-allocated hash; DB_DBT_APPMALLOC tells
- *             libdb that the buffer was allocated by the application.
- *
- * return: 0 on success, ENOMEM when the key buffer cannot be allocated.
- */
-static int
-bdb_generate_secondary_hash (DB *secondary,
-                             const DBT *pkey,
-                             const DBT *data,
-                             DBT *skey)
-{
-        uint32_t *hash = NULL;
-
-        hash = calloc (1, sizeof (*hash));
-        if (hash == NULL)
-                return ENOMEM;
-
-        *hash = gf_dm_hashfn (pkey->data, pkey->size);
-
-        skey->data  = hash;
-        /* size of the hash value itself, not of the pointer to it */
-        skey->size  = sizeof (*hash);
-        skey->flags = DB_DBT_APPMALLOC;
-
-        return 0;
-}
-
-/***********************************************************
- *
- * bdb storage database utilities
- *
- **********************************************************/
-
-/*
- * bdb_db_open - open the primary and secondary databases of a directory
- * context and associate them.
- *
- * @bctx: context specific to the directory whose db is to be opened.
- *
- * On success the opened handles are stored in bctx->primary and
- * bctx->secondary. On failure both handles are closed again and @bctx is
- * left untouched.
- *
- * return: 0 on success, -ENOMEM when a database object cannot be created,
- *         -1 on any other failure (including a NULL @bctx or table).
- */
-static int
-bdb_db_open (bctx_t *bctx)
-{
-        DB           *primary   = NULL;
-        DB           *secondary = NULL;
-        int32_t       ret       = -1;
-        bctx_table_t *table     = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-
-        table = bctx->table;
-        GF_VALIDATE_OR_GOTO ("bdb-ll", table, out);
-
-        /* libdb reports failure with any non-zero value (positive errno
-         * values as well as negative DB_* codes), so test against 0 */
-        ret = db_create (&primary, table->dbenv, 0);
-        if (ret != 0) {
-                gf_log ("bdb-ll", GF_LOG_DEBUG,
-                        "_BDB_DB_OPEN %s: %s (failed to create database object"
-                        " for primary database)",
-                        bctx->directory, db_strerror (ret));
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        if (table->page_size) {
-                /* a failure here is only logged; the open proceeds with
-                 * the default page size */
-                ret = primary->set_pagesize (primary,
-                                             table->page_size);
-                if (ret != 0) {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_OPEN %s: %s (failed to set page-size "
-                                "to %"PRIu64")",
-                                bctx->directory, db_strerror (ret),
-                                table->page_size);
-                } else {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_OPEN %s: page-size set to %"PRIu64,
-                                bctx->directory, table->page_size);
-                }
-        }
-
-        ret = primary->open (primary, NULL, bctx->db_path, "primary",
-                             table->access_mode, table->dbflags, 0);
-        if (ret != 0) {
-                gf_log ("bdb-ll", GF_LOG_ERROR,
-                        "_BDB_DB_OPEN %s: %s "
-                        "(failed to open primary database)",
-                        bctx->directory, db_strerror (ret));
-                ret = -1;
-                goto cleanup;
-        }
-
-        ret = db_create (&secondary, table->dbenv, 0);
-        if (ret != 0) {
-                gf_log ("bdb-ll", GF_LOG_DEBUG,
-                        "_BDB_DB_OPEN %s: %s (failed to create database object"
-                        " for secondary database)",
-                        bctx->directory, db_strerror (ret));
-                ret = -ENOMEM;
-                goto cleanup;
-        }
-
-        ret = secondary->open (secondary, NULL, bctx->db_path, "secondary",
-                               table->access_mode, table->dbflags, 0);
-        if (ret != 0 ) {
-                gf_log ("bdb-ll", GF_LOG_ERROR,
-                        "_BDB_DB_OPEN %s: %s "
-                        "(failed to open secondary database)",
-                        bctx->directory, db_strerror (ret));
-                ret = -1;
-                goto cleanup;
-        }
-
-        ret = primary->associate (primary, NULL, secondary,
-                                  bdb_generate_secondary_hash,
-#ifdef DB_IMMUTABLE_KEY
-                                  DB_IMMUTABLE_KEY);
-#else
-                                  0);
-#endif
-        if (ret != 0 ) {
-                gf_log ("bdb-ll", GF_LOG_ERROR,
-                        "_BDB_DB_OPEN %s: %s "
-                        "(failed to associate primary database with "
-                        "secondary database)",
-                        bctx->directory, db_strerror (ret));
-                ret = -1;
-                goto cleanup;
-        }
-
-        /* publish the handles only once both databases are usable */
-        bctx->primary   = primary;
-        bctx->secondary = secondary;
-
-        return 0;
-
-cleanup:
-        if (primary)
-                primary->close (primary, 0);
-        if (secondary)
-                secondary->close (secondary, 0);
-out:
-        return ret;
-}
-
-/*
- * bdb_cursor_close - close @cursorp while holding the lock of @bctx.
- *
- * return: value returned by the libdb close call, or -1 when an argument
- *         is NULL.
- */
-int32_t
-bdb_cursor_close (bctx_t *bctx,
-                  DBC *cursorp)
-{
-        int32_t status = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
-
-        LOCK (&bctx->lock);
-        {
-                /* the close method was renamed between libdb versions */
-#ifdef HAVE_BDB_CURSOR_GET
-                status = cursorp->close (cursorp);
-#else
-                status = cursorp->c_close (cursorp);
-#endif
-                if (status < 0)
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_CURSOR_CLOSE %s: %s "
-                                "(failed to close database cursor)",
-                                bctx->directory, db_strerror (status));
-        }
-        UNLOCK (&bctx->lock);
-
-out:
-        return status;
-}
-
-
-/*
- * bdb_cursor_open - open a cursor on the secondary database of @bctx,
- * opening the databases first when they are not open yet.
- *
- * return: 0 on success, -ENOMEM when the databases cannot be opened, the
- *         libdb result when creating the cursor fails, -1 when an argument
- *         is NULL.
- */
-int32_t
-bdb_cursor_open (bctx_t *bctx,
-                 DBC **cursorpp)
-{
-        int32_t status = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out);
-
-        LOCK (&bctx->lock);
-        {
-                status = 0;
-
-                if (!bctx->secondary) {
-                        if (bdb_db_open (bctx) < 0) {
-                                gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                        "_BDB_CURSOR_OPEN %s: ENOMEM "
-                                        "(failed to open secondary database)",
-                                        bctx->directory);
-                                status = -ENOMEM;
-                        }
-                }
-
-                if (status == 0) {
-                        /* databases are available, create the cursor */
-                        status = bctx->secondary->cursor (bctx->secondary,
-                                                          NULL, cursorpp, 0);
-                        if (status < 0) {
-                                gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                        "_BDB_CURSOR_OPEN %s: %s "
-                                        "(failed to open a cursor to database)",
-                                        bctx->directory,
-                                        db_strerror (status));
-                        }
-                }
-        }
-        UNLOCK (&bctx->lock);
-
-out:
-        return status;
-}
-
-
-/* cache related */
-/*
- * bdb_cache_lookup - find the cache entry of @bctx whose key matches the
- * key derived from @path.
- *
- * return: the matching entry, or NULL when there is none or an argument
- *         is NULL.
- */
-static bdb_cache_t *
-bdb_cache_lookup (bctx_t *bctx,
-                  char *path)
-{
-        bdb_cache_t *found = NULL;
-        bdb_cache_t *iter  = NULL;
-        char        *key   = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
-
-        MAKE_KEY_FROM_PATH (key, path);
-
-        LOCK (&bctx->lock);
-        {
-                list_for_each_entry (iter, &bctx->c_list, c_list) {
-                        if (strcmp (iter->key, key) != 0)
-                                continue;
-
-                        found = iter;
-                        break;
-                }
-        }
-        UNLOCK (&bctx->lock);
-
-out:
-        return found;
-}
-
-/*
- * bdb_cache_insert - remember a key/value pair in the per-directory cache.
- *
- * @bctx: context whose cache is updated.
- * @key:  key of the record; copied and NUL-terminated.
- * @data: value of the record; copied.
- *
- * Once the cache holds more than 5 entries the entry at the tail of the
- * list is recycled for the new pair, otherwise a new entry is allocated.
- * In both cases the entry ends up at the head of the list. The copies are
- * made before the cache is touched, so a failed allocation leaves the
- * cache exactly as it was.
- *
- * return: 0 on success, -1 on a NULL argument or allocation failure.
- */
-static int32_t
-bdb_cache_insert (bctx_t *bctx,
-                  DBT *key,
-                  DBT *data)
-{
-        bdb_cache_t *bcache   = NULL;
-        char        *new_key  = NULL;
-        void        *new_data = NULL;
-        int32_t      ret      = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", data, out);
-
-        new_key = calloc (key->size + 1, sizeof (char));
-        GF_VALIDATE_OR_GOTO ("bdb-ll", new_key, out);
-        memcpy (new_key, key->data, key->size);
-
-        new_data = memdup (data->data, data->size);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", new_data, out);
-
-        LOCK (&bctx->lock);
-        {
-                if ((bctx->c_count > 5) && !list_empty (&bctx->c_list)) {
-                        /* most of the times, we enter here: recycle the
-                         * entry at the tail of the list */
-                        bcache = list_entry (bctx->c_list.prev,
-                                             bdb_cache_t, c_list);
-                        list_del_init (&bcache->c_list);
-                        free (bcache->key);
-                        free (bcache->data);
-                } else {
-                        /* we will be entering here very rarely */
-                        bcache = CALLOC (1, sizeof (*bcache));
-                        GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock);
-                        bctx->c_count++;
-                }
-
-                /* ownership of both copies moves to the cache entry */
-                bcache->key  = new_key;
-                bcache->data = new_data;
-                bcache->size = data->size;
-                new_key  = NULL;
-                new_data = NULL;
-
-                list_add (&bcache->c_list, &bctx->c_list);
-                ret = 0;
-        }
-unlock:
-        UNLOCK (&bctx->lock);
-out:
-        /* both are NULL unless the insert failed part-way */
-        free (new_key);
-        free (new_data);
-
-        return ret;
-}
-
-/*
- * bdb_cache_delete - drop the cache entry of @bctx stored under @key, if
- * there is one.
- *
- * return: always 0.
- */
-static int32_t
-bdb_cache_delete (bctx_t *bctx,
-                  const char *key)
-{
-        bdb_cache_t *victim = NULL;
-        bdb_cache_t *iter   = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
-
-        LOCK (&bctx->lock);
-        {
-                list_for_each_entry (iter, &bctx->c_list, c_list) {
-                        if (strcmp (iter->key, key) == 0) {
-                                victim = iter;
-                                break;
-                        }
-                }
-
-                if (victim != NULL) {
-                        bctx->c_count--;
-                        list_del_init (&victim->c_list);
-                        free (victim->key);
-                        free (victim->data);
-                        free (victim);
-                }
-        }
-        UNLOCK (&bctx->lock);
-
-out:
-        return 0;
-}
-
-/*
- * bdb_db_stat - fetch the statistics structure of the primary database of
- * @bctx, opening the database first when necessary.
- *
- * return: pointer filled in by DB->stat(), or NULL when the database is
- *         not available.
- */
-void *
-bdb_db_stat (bctx_t *bctx,
-             DB_TXN *txnid,
-             uint32_t flags)
-{
-        DB      *db     = NULL;
-        void    *result = NULL;
-        int32_t  status = -1;
-
-        LOCK (&bctx->lock);
-        {
-                if (bctx->primary == NULL)
-                        status = bdb_db_open (bctx);
-
-                /* stays NULL when the open above failed */
-                db = bctx->primary;
-        }
-        UNLOCK (&bctx->lock);
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", db, out);
-
-        status = db->stat (db, txnid, &result, flags);
-        if (status < 0)
-                gf_log ("bdb-ll", GF_LOG_DEBUG,
-                        "_BDB_DB_STAT %s: %s "
-                        "(failed to do stat database)",
-                        bctx->directory, db_strerror (status));
-out:
-        return result;
-
-}
-
-/*
- * bdb_copy_range - copy at most @size bytes of the @src_len byte record
- * @src, starting at @offset, into @dst.
- *
- * return: number of bytes copied; 0 when @offset lies outside the record.
- */
-static size_t
-bdb_copy_range (char *dst, const void *src, size_t src_len,
-                size_t size, off_t offset)
-{
-        size_t avail = 0;
-
-        if ((offset < 0) || ((uint64_t) offset >= (uint64_t) src_len))
-                return 0;
-
-        avail = src_len - (size_t) offset;
-        if (avail > size)
-                avail = size;
-
-        memcpy (dst, (const char *) src + offset, avail);
-
-        return avail;
-}
-
-/* bdb_db_get - retrieve the value stored for @path from the primary
- * database of @bctx.
- *
- * @bctx:   bctx_t * corresponding to the parent directory of @path.
- * @txnid:  NULL if the read is not embedded in an explicit transaction,
- *          or a valid DB_TXN * when it is.
- * @path:   path of the file to read from (translated to a database key
- *          using MAKE_KEY_FROM_PATH).
- * @buf:    caller-supplied buffer that receives the data; may be NULL when
- *          only the size of the record is wanted.
- * @size:   number of bytes to read into @buf.
- * @offset: offset within the record from which to read.
- *
- * NOTE: the databases are opened here when @bctx->primary is NULL.
- *
- * NOTE: when @bctx->cache is set, the per-directory cache is consulted
- * first (bdb_cache_lookup) and a record fetched from the database is added
- * to it (bdb_cache_insert).
- *
- * return: number of bytes copied into @buf, or the size of the record when
- *         @buf is NULL (cache enabled) or the cache is disabled; -1 on
- *         error.
- */
-static int32_t
-bdb_db_get (bctx_t *bctx,
-            DB_TXN *txnid,
-            const char *path,
-            char *buf,
-            size_t size,
-            off_t offset)
-{
-        DB          *storage      = NULL;
-        DBT          key          = {0,};
-        DBT          value        = {0,};
-        int32_t      ret          = -1;
-        char        *key_string   = NULL;
-        bdb_cache_t *bcache       = NULL;
-        int32_t      db_flags     = 0;
-        uint8_t      need_break   = 0;
-        uint8_t      db_allocated = 0;
-        int32_t      retries      = 1;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
-
-        MAKE_KEY_FROM_PATH (key_string, path);
-
-        if (bctx->cache &&
-            ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) {
-                if (buf) {
-                        ret = bdb_copy_range (buf, bcache->data, bcache->size,
-                                              size, offset);
-                } else {
-                        ret = bcache->size;
-                }
-
-                goto out;
-        }
-
-        LOCK (&bctx->lock);
-        {
-                if (bctx->primary == NULL)
-                        ret = bdb_db_open (bctx);
-
-                /* stays NULL when the open above failed */
-                storage = bctx->primary;
-        }
-        UNLOCK (&bctx->lock);
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
-        key.data = (char *)key_string;
-        key.size = strlen (key_string);
-        key.flags = DB_DBT_USERMEM;
-
-        if (bctx->cache){
-                /* fetch the whole record so that it can be cached */
-                value.flags = DB_DBT_MALLOC;
-                db_allocated = 1;
-        } else {
-                if (size) {
-                        value.data = buf;
-                        value.ulen = size;
-                        value.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
-                } else {
-                        value.flags = DB_DBT_MALLOC;
-                        db_allocated = 1;
-                }
-                value.dlen = size;
-                value.doff = offset;
-        }
-
-        do {
-                /* TODO: we prefer to give our own buffer to value.data
-                 * and ask bdb to fill in it */
-                ret = storage->get (storage, txnid, &key, &value,
-                                    db_flags);
-
-                if (ret == DB_NOTFOUND) {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_GET %s - %s: ENOENT"
-                                "(specified key not found in database)",
-                                bctx->directory, key_string);
-                        ret = -1;
-                        need_break = 1;
-                } else if (ret == DB_LOCK_DEADLOCK) {
-                        retries++;
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_GET %s - %s"
-                                "(deadlock detected, retrying for %d "
-                                "time)",
-                                bctx->directory, key_string, retries);
-                } else if (ret == 0) {
-                        /* successfully read data, lets set everything
-                         * in place and return */
-                        if (bctx->cache) {
-                                if (buf) {
-                                        ret = bdb_copy_range (buf, value.data,
-                                                              value.size,
-                                                              size, offset);
-                                } else {
-                                        /* size query: same answer as on a
-                                         * cache hit */
-                                        ret = value.size;
-                                }
-
-                                bdb_cache_insert (bctx, &key, &value);
-                        } else {
-                                ret = value.size;
-                        }
-
-                        /* the cache keeps its own copy, so a buffer that
-                         * libdb allocated is always released here */
-                        if (db_allocated) {
-                                free (value.data);
-                                value.data = NULL;
-                        }
-
-                        need_break = 1;
-                } else {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_GET %s - %s: %s"
-                                "(failed to retrieve specified key from"
-                                " database)",
-                                bctx->directory, key_string,
-                                db_strerror (ret));
-                        ret = -1;
-                        need_break = 1;
-                }
-        } while (!need_break);
-
-out:
-        return ret;
-}/* bdb_db_get */
-
-/*
- * bdb_db_fread - read @size bytes at @offset from the record behind the
- * open file @bfd into @buf. Returns whatever bdb_db_get() returns.
- *
- * TODO: handle errors here and log. propogate only the errno to caller
- */
-int32_t
-bdb_db_fread (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
-{
-        int32_t nread = 0;
-
-        nread = bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset);
-
-        return nread;
-}
-
-/*
- * bdb_db_iread - read the complete record stored under @key.
- *
- * @bctx: context of the directory holding the record.
- * @key:  key of the record.
- * @bufp: when non-NULL, receives a newly allocated buffer holding the
- *        record (to be released by the caller with free()); it is set to
- *        NULL on failure. When NULL only the size is determined.
- *
- * return: size of the record, or a negative value on failure.
- */
-int32_t
-bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp)
-{
-        char    *buf  = NULL;
-        size_t   size = 0;
-        int64_t  ret  = 0;
-
-        /* first pass: ask for the size of the record only */
-        ret = bdb_db_get (bctx, NULL, key, NULL, 0, 0);
-        if (ret < 0) {
-                /* do not turn an error code into an allocation size */
-                if (bufp)
-                        *bufp = NULL;
-                return ret;
-        }
-        size = ret;
-
-        if (bufp) {
-                buf = calloc (size, sizeof (char));
-                /* calloc (0, ...) may legitimately return NULL */
-                if ((buf == NULL) && (size > 0)) {
-                        *bufp = NULL;
-                        return -1;
-                }
-
-                *bufp = buf;
-                ret = bdb_db_get (bctx, NULL, key, buf, size, 0);
-                if (ret < 0) {
-                        free (buf);
-                        *bufp = NULL;
-                }
-        }
-
-        return ret;
-}
-
-/* bdb_db_put - store data under @key_string in the primary database of
- * @bctx.
- *
- * @bctx:       bctx_t * corresponding to the parent directory of the entry.
- * @txnid:      NULL if the write is not embedded in an explicit
- *              transaction, or a valid DB_TXN * when it is.
- * @key_string: key of the database entry.
- * @buf:        data to be written for @key_string.
- * @size:       size of @buf.
- * @offset:     offset in the key's data to be modified with provided data.
- * @flags:      BDB_TRUNCATE_RECORD replaces the first @offset bytes of the
- *              record with @buf; without it @size bytes at @offset are
- *              replaced.
- *
- * A NULL @buf with @size 0 writes a complete, empty record (used for
- * create and truncate).
- *
- * NOTE: the databases are opened here when @bctx->primary is NULL.
- *
- * NOTE: when caching is enabled, the cached copy of the entry is dropped
- * before writing (see bdb_cache_delete).
- *
- * return: 0 on success or -1 on error.
- */
-static int32_t
-bdb_db_put (bctx_t *bctx,
-            DB_TXN *txnid,
-            const char *key_string,
-            const char *buf,
-            size_t size,
-            off_t offset,
-            int32_t flags)
-{
-        DB      *storage    = NULL;
-        DBT      key        = {0,}, value = {0,};
-        int32_t  ret        = -1;
-        int32_t  db_flags   = DB_AUTO_COMMIT;
-        uint8_t  need_break = 0;
-        int32_t  retries    = 1;
-
-        LOCK (&bctx->lock);
-        {
-                if (bctx->primary == NULL)
-                        ret = bdb_db_open (bctx);
-
-                /* stays NULL when the open above failed */
-                storage = bctx->primary;
-        }
-        UNLOCK (&bctx->lock);
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
-        if (bctx->cache) {
-                ret = bdb_cache_delete (bctx, (char *)key_string);
-                GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
-        }
-
-        key.data = (void *)key_string;
-        key.size = strlen (key_string);
-
-        /* NOTE: with DB_DBT_PARTIAL, value.dlen bytes starting at
-         * value.doff are replaced by value.size bytes from value.data; the
-         * record grows or shrinks when the two lengths differ.
-         */
-        value.data = (void *)buf;
-        value.size = size;
-
-        if (flags & BDB_TRUNCATE_RECORD) {
-                value.doff = 0;
-                value.dlen = offset;
-        } else {
-                value.dlen = size;
-                value.doff = offset;
-        }
-        value.flags = DB_DBT_PARTIAL;
-        if (buf == NULL && size == 0)
-                /* truncate called us */
-                value.flags = 0;
-
-        do {
-                ret = storage->put (storage, txnid, &key, &value, db_flags);
-                if (ret == DB_LOCK_DEADLOCK) {
-                        retries++;
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_PUT %s - %s"
-                                "(deadlock detected, retying for %d time)",
-                                bctx->directory, key_string, retries);
-                } else if (ret) {
-                        /* write failed */
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_PUT %s - %s: %s"
-                                "(failed to put specified entry into database)",
-                                bctx->directory, key_string, db_strerror (ret));
-                        /* libdb may report a positive errno value; keep
-                         * the documented -1 so that callers testing for a
-                         * negative result see the failure */
-                        ret = -1;
-                        need_break = 1;
-                } else {
-                        /* successfully wrote */
-                        ret = 0;
-                        need_break = 1;
-                }
-        } while (!need_break);
-out:
-        return ret;
-}/* bdb_db_put */
-
-/* Create an empty record for @key (a put with no data, size 0, offset 0). */
-int32_t
-bdb_db_icreate (struct bdb_ctx *bctx, const char *key)
-{
-        int32_t status = 0;
-
-        status = bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0);
-
-        return status;
-}
-
-/*
- * bdb_db_fwrite - write @size bytes from @buf at @offset into the record
- * behind the open file @bfd. Returns whatever bdb_db_put() returns.
- *
- * TODO: handle errors here and log. propogate only the errno to caller
- */
-int32_t
-bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
-{
-        int32_t status = 0;
-
-        status = bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0);
-
-        return status;
-}
-
-/*
- * bdb_db_iwrite - write @size bytes from @buf at offset 0 of the record
- * stored under @key. Returns whatever bdb_db_put() returns.
- *
- * TODO: handle errors here and log. propogate only the errno to caller
- */
-int32_t
-bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size)
-{
-        int32_t status = 0;
-
-        status = bdb_db_put (bctx, NULL, key, buf, size, 0, 0);
-
-        return status;
-}
-
-/*
- * bdb_db_itruncate - replace the record stored under @key with an empty
- * one (a put with no data and size 0).
- */
-int32_t
-bdb_db_itruncate (struct bdb_ctx *bctx, const char *key)
-{
-        int32_t status = 0;
-
-        status = bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0);
-
-        return status;
-}
-
-/* bdb_db_del - delete the key/value pair stored under @key_string from the
- * primary database of @bctx.
- *
- * @bctx:       bctx_t * corresponding to the parent directory of the entry.
- * @txnid:      NULL if the delete is not embedded in an explicit
- *              transaction, or a valid DB_TXN * when it is.
- * @key_string: key of the entry to be deleted.
- *
- * NOTE: the databases are opened here when @bctx->primary is NULL, and the
- * cached copy of the entry is dropped before deleting.
- *
- * return: 0 on success, DB_NOTFOUND when the key does not exist, -1 on any
- *         other error.
- */
-static int32_t
-bdb_db_del (bctx_t *bctx,
-            DB_TXN *txnid,
-            const char *key_string)
-{
-        DB      *db       = NULL;
-        DBT      key      = {0,};
-        int32_t  ret      = -1;
-        int32_t  db_flags = 0;
-        int32_t  retries  = 1;
-
-        LOCK (&bctx->lock);
-        {
-                if (bctx->primary == NULL)
-                        ret = bdb_db_open (bctx);
-
-                /* stays NULL when the open above failed */
-                db = bctx->primary;
-        }
-        UNLOCK (&bctx->lock);
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", db, out);
-
-        ret = bdb_cache_delete (bctx, key_string);
-        GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
-
-        key.data = (char *)key_string;
-        key.size = strlen (key_string);
-        key.flags = DB_DBT_USERMEM;
-
-        for (;;) {
-                ret = db->del (db, txnid, &key, db_flags);
-
-                if (ret == DB_LOCK_DEADLOCK) {
-                        /* the only outcome that is retried */
-                        retries++;
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_DEL %s - %s"
-                                "(deadlock detected, retying for %d time)",
-                                bctx->directory, key_string, retries);
-                        continue;
-                }
-
-                if (ret == 0) {
-                        /* successfully deleted the entry */
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_DEL %s - %s"
-                                "(successfully deleted entry from database)",
-                                bctx->directory, key_string);
-                } else if (ret == DB_NOTFOUND) {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_DEL %s - %s: ENOENT"
-                                "(failed to delete entry, could not be "
-                                "found in the database)",
-                                bctx->directory, key_string);
-                } else {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "_BDB_DB_DEL %s - %s: %s"
-                                "(failed to delete entry from database)",
-                                bctx->directory, key_string, db_strerror (ret));
-                        ret = -1;
-                }
-
-                break;
-        }
-out:
-        return ret;
-}
-
-/* Remove the record stored under @key, outside of any explicit transaction. */
-int32_t
-bdb_db_iremove (bctx_t *bctx,
-                const char *key)
-{
-        int32_t status = 0;
-
-        status = bdb_db_del (bctx, NULL, key);
-
-        return status;
-}
-
-/*
- * bdb_cursor_get - fetch secondary key, primary key and value through
- * @cursorp.
- *
- * NOTE: bdb version compatibility wrapper: the method is called pget or
- * c_pget depending on HAVE_BDB_CURSOR_GET.
- *
- * return: the libdb result (DB_NOTFOUND is passed through without being
- *         logged), or -1 when @cursorp is NULL.
- */
-int32_t
-bdb_cursor_get (DBC *cursorp,
-                DBT *sec, DBT *pri,
-                DBT *val,
-                int32_t flags)
-{
-        int32_t status = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
-
-#ifdef HAVE_BDB_CURSOR_GET
-        status = cursorp->pget (cursorp, sec, pri, val, flags);
-#else
-        status = cursorp->c_pget (cursorp, sec, pri, val, flags);
-#endif
-        if ((status == 0) || (status == DB_NOTFOUND))
-                goto out;
-
-        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                "_BDB_CURSOR_GET: %s"
-                "(failed to retrieve entry from database cursor)",
-                db_strerror (status));
-
-out:
-        return status;
-}/* bdb_cursor_get */
-
-/*
- * bdb_dirent_size - aligned size of a directory entry whose name is @key.
- *
- * NOTE(review): the constant 24 is flagged as needing a fix in the original
- * code; it is presumably the fixed part of a dirent record -- confirm.
- */
-int32_t
-bdb_dirent_size (DBT *key)
-{
-        return ALIGN (24 /* FIX ME */ + key->size);
-}
-
-
-
-/* bdb_dbenv_init - create and configure the Berkeley DB environment.
- *
- * @this:      bdb xlator; its private structure supplies the options.
- * @directory: home directory of the environment.
- *
- * steps:
- *  1. create the DB_ENV, enable default deadlock detection and open it
- *     with private->envflags; when libdb asks for recovery the open is
- *     retried with DB_RECOVER_FATAL in place of DB_RECOVER.
- *  2. configure automatic removal of transaction log files.
- *  3. when transactions are enabled: auto-commit, transaction and lock
- *     timeouts and the log directory.
- *  4. when an error file is configured, direct libdb's error output to it.
- *
- * NOTE(review): Berkeley DB documents DB_ENV->set_lg_dir as a call to be
- * made before DB_ENV->open; only negative results are treated as failures
- * for that call here -- confirm the intended behaviour before tightening
- * the check.
- *
- * return: a valid DB_ENV * on success or NULL on error (the handle is
- *         closed on every failure path).
- */
-static DB_ENV *
-bdb_dbenv_init (xlator_t *this,
-                char *directory)
-{
-        /* Create a DB environment */
-        DB_ENV        *dbenv       = NULL;
-        int32_t        ret         = 0;
-        bdb_private_t *private     = NULL;
-        int32_t        fatal_flags = 0;
-
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (directory, err);
-
-        private = this->private;
-        VALIDATE_OR_GOTO (private, err);
-
-        ret = db_env_create (&dbenv, 0);
-        VALIDATE_OR_GOTO ((ret == 0), err);
-
-        /* NOTE: set_errpfx returns 'void' */
-        dbenv->set_errpfx(dbenv, this->name);
-
-        ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT);
-        VALIDATE_OR_GOTO ((ret == 0), err);
-
-        ret = dbenv->open(dbenv, directory,
-                          private->envflags,
-                          S_IRUSR | S_IWUSR);
-        if ((ret != 0) && (ret != DB_RUNRECOVERY)) {
-                gf_log (this->name, GF_LOG_CRITICAL,
-                        "failed to join Berkeley DB environment at %s: %s."
-                        "please run manual recovery and retry running "
-                        "glusterfs",
-                        directory, db_strerror (ret));
-                /* keep the handle so that the error path can close it */
-                goto err;
-        } else if (ret == DB_RUNRECOVERY) {
-                fatal_flags = ((private->envflags & (~DB_RECOVER))
-                               | DB_RECOVER_FATAL);
-                ret = dbenv->open(dbenv, directory, fatal_flags,
-                                  S_IRUSR | S_IWUSR);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_CRITICAL,
-                                "failed to join Berkeley DB environment in "
-                                "recovery mode at %s: %s. please run manual "
-                                "recovery and retry running glusterfs",
-                                directory, db_strerror (ret));
-                        goto err;
-                }
-        }
-
-        ret = 0;
-#if (DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR == 7)
-        ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE,
-                                     (private->log_auto_remove) ? 1 : 0);
-#else
-        ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE,
-                                (private->log_auto_remove) ? 1 : 0);
-#endif
-        if (ret < 0) {
-                gf_log ("bdb-ll", GF_LOG_ERROR,
-                        "autoremoval of transactional log files could not be "
-                        "configured (%s). you may have to do a manual "
-                        "monitoring of transactional log files and remove "
-                        "periodically.",
-                        db_strerror (ret));
-                goto err;
-        }
-
-        if (private->transaction) {
-                ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
-
-                if (ret != 0) {
-                        gf_log ("bdb-ll", GF_LOG_DEBUG,
-                                "configuration of auto-commit failed for "
-                                "database environment at %s (%s). none of the "
-                                "operations will be embedded in transaction "
-                                "unless explicitly done so.",
-                                directory, db_strerror (ret));
-                        goto err;
-                }
-
-                if (private->txn_timeout) {
-                        ret = dbenv->set_timeout (dbenv, private->txn_timeout,
-                                                  DB_SET_TXN_TIMEOUT);
-                        if (ret != 0) {
-                                gf_log ("bdb-ll", GF_LOG_ERROR,
-                                        "could not configure Berkeley DB "
-                                        "transaction timeout to %d (%s). please"
-                                        " review 'option transaction-timeout %d"
-                                        "' option.",
-                                        private->txn_timeout,
-                                        db_strerror (ret),
-                                        private->txn_timeout);
-                                goto err;
-                        }
-                }
-
-                if (private->lock_timeout) {
-                        /* the lock timeout has its own option; it must not
-                         * be set from the transaction timeout */
-                        ret = dbenv->set_timeout(dbenv,
-                                                 private->lock_timeout,
-                                                 DB_SET_LOCK_TIMEOUT);
-                        if (ret < 0) {
-                                gf_log ("bdb-ll", GF_LOG_ERROR,
-                                        "could not configure Berkeley DB "
-                                        "lock timeout to %d (%s). please"
-                                        " review 'option lock-timeout %d"
-                                        "' option.",
-                                        private->lock_timeout,
-                                        db_strerror (ret),
-                                        private->lock_timeout);
-                                goto err;
-                        }
-                }
-
-                ret = dbenv->set_lg_dir (dbenv, private->logdir);
-                if (ret < 0) {
-                        gf_log ("bdb-ll", GF_LOG_ERROR,
-                                "failed to configure libdb transaction log "
-                                "directory at %s (%s). please review the "
-                                "'option logdir %s' option.",
-                                private->logdir, db_strerror (ret),
-                                private->logdir);
-                        goto err;
-                }
-        }
-
-        if (private->errfile) {
-                private->errfp = fopen (private->errfile, "a+");
-                if (private->errfp) {
-                        dbenv->set_errfile (dbenv, private->errfp);
-                } else {
-                        gf_log ("bdb-ll", GF_LOG_ERROR,
-                                "failed to open error logging file for "
-                                "libdb (Berkeley DB) internal logging (%s)."
-                                "please review the 'option errfile %s' option.",
-                                strerror (errno), private->errfile);
-                        goto err;
-                }
-        }
-
-        return dbenv;
-err:
-        if (dbenv) {
-                dbenv->close (dbenv, 0);
-        }
-
-        return NULL;
-}
-
-#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
-
-/* bdb_checkpoint - during transactional usage, db does not directly write the
- * data to db files, instead db writes a 'log' (similar to a journal entry)
- * into a log file. db normally clears the log files during opening of an
- * environment. since we expect a filesystem server to run for a pretty long
- * duration and flushing 'log's during dbenv->open would prove very costly, if
- * we accumulate the log entries for one complete run of glusterfs server. to
- * flush the logs frequently, db provides a mechanism called 'checkpointing'.
- * when we do a checkpoint, db flushes the logs to disk (writes changes to db
- * files) and we can also clear the accumulated log files after checkpointing.
- * NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint()
- * call.
- *
- * @data: xlator_t of the current instance of bdb xlator.
- *
- * bdb_checkpoint is called in a different thread from the main glusterfs
- * thread. bdb xlator creates the checkpoint thread after successfully opening
- * the db environment.
- * NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem
- * thread.
- *
- * db environment checkpointing frequency is controlled by
- * 'option checkpoint-timeout <time-in-seconds>' in volfile.
- *
- * NOTE: checkpointing thread is started only if 'option transaction on'
- * specified in volfile. checkpointing is not valid for non-transactional
- * environments.
- *
- */
-static void *
-bdb_checkpoint (void *data)
-{
- xlator_t *this = NULL;
- struct bdb_private *private = NULL;
- DB_ENV *dbenv = NULL;
- int32_t ret = 0;
- uint32_t active = 0;
-
- this = (xlator_t *) data;
- dbenv = BDB_ENV(this);
- private = this->private;
-
- for (;;sleep (private->checkpoint_interval)) {
- LOCK (&private->active_lock);
- active = private->active;
- UNLOCK (&private->active_lock);
-
- if (active) {
- ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
- if (ret) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: %s"
- "(failed to checkpoint environment)",
- db_strerror (ret));
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: successfully "
- "checkpointed");
- }
- } else {
- ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
- if (ret) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "_BDB_CHECKPOINT: %s"
- "(final checkpointing failed. might "
- "need to run recovery tool manually on "
- "next usage of this database "
- "environment)",
- db_strerror (ret));
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: final successfully "
- "checkpointed");
- }
- break;
- }
- }
-
- return NULL;
-}
-
-
-/* bdb_db_init - initialize bdb xlator
- *
- * reads the options from @options dictionary and sets appropriate values in
- * @this->private. also initializes DB_ENV.
- *
- * return: 0 on success or -1 on error
- * (with logging the error through gf_log()).
- */
-int
-bdb_db_init (xlator_t *this,
- dict_t *options)
-{
- /* create a db entry for root */
- int32_t op_ret = 0;
- bdb_private_t *private = NULL;
- bctx_table_t *table = NULL;
-
- char *checkpoint_interval_str = NULL;
- char *page_size_str = NULL;
- char *lru_limit_str = NULL;
- char *timeout_str = NULL;
- char *access_mode = NULL;
- char *endptr = NULL;
- char *errfile = NULL;
- char *directory = NULL;
- char *logdir = NULL;
- char *mode = NULL;
- char *mode_str = NULL;
- int ret = -1;
- int idx = 0;
- struct stat stbuf = {0,};
-
- private = this->private;
-
- /* cache is always on */
- private->cache = ON;
-
- ret = dict_get_str (options, "access-mode", &access_mode);
- if ((ret == 0)
- && (!strcmp (access_mode, "btree"))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "using BTREE access mode to access libdb "
- "(Berkeley DB)");
- private->access_mode = DB_BTREE;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "using HASH access mode to access libdb (Berkeley DB)");
- private->access_mode = DB_HASH;
- }
-
- ret = dict_get_str (options, "mode", &mode);
- if ((ret == 0)
- && (!strcmp (mode, "cache"))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cache data mode selected for 'storage/bdb'. filesystem"
- " operations are not transactionally protected and "
- "system crash does not guarantee recoverability of "
- "data");
- private->envflags = DB_CREATE | DB_INIT_LOG |
- DB_INIT_MPOOL | DB_THREAD;
- private->dbflags = DB_CREATE | DB_THREAD;
- private->transaction = OFF;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "persistent data mode selected for 'storage/bdb'. each"
- "filesystem operation is guaranteed to be Berkeley DB "
- "transaction protected.");
- private->transaction = ON;
- private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
- DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD;
- private->dbflags = DB_CREATE | DB_THREAD;
-
-
- ret = dict_get_str (options, "lock-timeout", &timeout_str);
-
- if (ret == 0) {
- ret = gf_string2time (timeout_str,
- &private->lock_timeout);
-
- if (private->lock_timeout > 4260000) {
- /* db allows us to DB_SET_LOCK_TIMEOUT to be
- * set to a maximum of 71 mins
- * (4260000 milliseconds) */
- gf_log (this->name, GF_LOG_DEBUG,
- "Berkeley DB lock-timeout parameter "
- "(%d) is out of range. please specify"
- " a valid timeout value for "
- "lock-timeout and retry.",
- private->lock_timeout);
- goto err;
- }
- }
- ret = dict_get_str (options, "transaction-timeout",
- &timeout_str);
- if (ret == 0) {
- ret = gf_string2time (timeout_str,
- &private->txn_timeout);
-
- if (private->txn_timeout > 4260000) {
- /* db allows us to DB_SET_TXN_TIMEOUT to be set
- * to a maximum of 71 mins
- * (4260000 milliseconds) */
- gf_log (this->name, GF_LOG_DEBUG,
- "Berkeley DB lock-timeout parameter "
- "(%d) is out of range. please specify"
- " a valid timeout value for "
- "lock-timeout and retry.",
- private->lock_timeout);
- goto err;
- }
- }
-
- private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL;
- ret = dict_get_str (options, "checkpoint-interval",
- &checkpoint_interval_str);
- if (ret == 0) {
- ret = gf_string2time (checkpoint_interval_str,
- &private->checkpoint_interval);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%"PRIu32"' is not a valid parameter "
- "for checkpoint-interval option. "
- "please specify a valid "
- "checkpoint-interval and retry",
- private->checkpoint_interval);
- goto err;
- }
- }
- }
-
- ret = dict_get_str (options, "file-mode", &mode_str);
- if (ret == 0) {
- private->file_mode = strtol (mode_str, &endptr, 8);
-
- if ((*endptr) ||
- (!IS_VALID_FILE_MODE(private->file_mode))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%o' is not a valid parameter for file-mode "
- "option. please specify a valid parameter for "
- "file-mode and retry.",
- private->file_mode);
- goto err;
- }
- } else {
- private->file_mode = DEFAULT_FILE_MODE;
- }
- private->symlink_mode = private->file_mode | S_IFLNK;
- private->file_mode = private->file_mode | S_IFREG;
-
- ret = dict_get_str (options, "dir-mode", &mode_str);
- if (ret == 0) {
- private->dir_mode = strtol (mode_str, &endptr, 8);
- if ((*endptr) ||
- (!IS_VALID_FILE_MODE(private->dir_mode))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%o' is not a valid parameter for dir-mode "
- "option. please specify a valid parameter for "
- "dir-mode and retry.",
- private->dir_mode);
- goto err;
- }
- } else {
- private->dir_mode = DEFAULT_DIR_MODE;
- }
-
- private->dir_mode = private->dir_mode | S_IFDIR;
-
- table = CALLOC (1, sizeof (*table));
- if (table == NULL) {
- gf_log ("bdb-ll", GF_LOG_CRITICAL,
- "memory allocation for 'storage/bdb' internal "
- "context table failed.");
- goto err;
- }
-
- INIT_LIST_HEAD(&(table->b_lru));
- INIT_LIST_HEAD(&(table->active));
- INIT_LIST_HEAD(&(table->purge));
-
- LOCK_INIT (&table->lock);
- LOCK_INIT (&table->checkpoint_lock);
-
- table->transaction = private->transaction;
- table->access_mode = private->access_mode;
- table->dbflags = private->dbflags;
- table->this = this;
-
- ret = dict_get_str (options, "lru-limit",
- &lru_limit_str);
-
- /* TODO: set max lockers and max txns to accomodate
- * for more than lru_limit */
- if (ret == 0) {
- ret = gf_string2uint32 (lru_limit_str,
- &table->lru_limit);
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "setting lru limit of 'storage/bdb' internal context"
- "table to %d. maximum of %d unused databases can be "
- "open at any given point of time.",
- table->lru_limit, table->lru_limit);
- } else {
- table->lru_limit = BDB_DEFAULT_LRU_LIMIT;
- }
-
- ret = dict_get_str (options, "page-size",
- &page_size_str);
-
- if (ret == 0) {
- ret = gf_string2bytesize (page_size_str,
- &table->page_size);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "\"%s\" is an invalid parameter to "
- "\"option page-size\". please specify a valid "
- "size and retry.",
- page_size_str);
- goto err;
- }
-
- if (!PAGE_SIZE_IN_RANGE(table->page_size)) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "\"%s\" is out of range for Berkeley DB "
- "page-size. allowed page-size range is %d to "
- "%d. please specify a page-size value in the "
- "range and retry.",
- page_size_str, BDB_LL_PAGE_SIZE_MIN,
- BDB_LL_PAGE_SIZE_MAX);
- goto err;
- }
- } else {
- table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
- }
-
- table->hash_size = BDB_DEFAULT_HASH_SIZE;
- table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE,
- sizeof (struct list_head));
-
- for (idx = 0; idx < table->hash_size; idx++)
- INIT_LIST_HEAD(&(table->b_hash[idx]));
-
- private->b_table = table;
-
- ret = dict_get_str (options, "errfile", &errfile);
- if (ret == 0) {
- private->errfile = strdup (errfile);
- gf_log (this->name, GF_LOG_DEBUG,
- "using %s as error logging file for libdb (Berkeley DB "
- "library) internal logging.", private->errfile);
- }
-
- ret = dict_get_str (options, "directory", &directory);
-
- if (ret == 0) {
- ret = dict_get_str (options, "logdir", &logdir);
-
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "using the database environment home "
- "directory (%s) itself as transaction log "
- "directory", directory);
- private->logdir = strdup (directory);
-
- } else {
- private->logdir = strdup (logdir);
-
- op_ret = stat (private->logdir, &stbuf);
- if ((op_ret != 0)
- || (!S_ISDIR (stbuf.st_mode))) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "specified logdir %s does not exist. "
- "please provide a valid existing "
- "directory as parameter to 'option "
- "logdir'",
- private->logdir);
- goto err;
- }
- }
-
- private->b_table->dbenv = bdb_dbenv_init (this, directory);
- if (private->b_table->dbenv == NULL) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "initialization of database environment "
- "failed");
- goto err;
- } else {
- if (private->transaction) {
- /* all well, start the checkpointing thread */
- LOCK_INIT (&private->active_lock);
-
- LOCK (&private->active_lock);
- {
- private->active = 1;
- }
- UNLOCK (&private->active_lock);
- pthread_create (&private->checkpoint_thread,
- NULL, bdb_checkpoint, this);
- }
- }
- }
-
- return op_ret;
-err:
- if (table) {
- FREE (table->b_hash);
- FREE (table);
- }
- if (private) {
- if (private->errfile)
- FREE (private->errfile);
-
- if (private->logdir)
- FREE (private->logdir);
- }
-
- return -1;
-}
diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c
deleted file mode 100644
index 086d7d9e7..000000000
--- a/xlators/storage/bdb/src/bdb.c
+++ /dev/null
@@ -1,3624 +0,0 @@
-/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-/* bdb based storage translator - named as 'bdb' translator
- *
- *
- * There can be only two modes for files existing on bdb translator:
- * 1. DIRECTORY - directories are stored by bdb as regular directories on
- * back-end file-system. directories also have an entry in the ns_db.db of
- * their parent directory.
- * 2. REGULAR FILE - regular files are stored as records in the storage_db.db
- * present in the directory. regular files also have an entry in ns_db.db
- *
- * Internally bdb has a maximum of three different types of logical files
- * associated with each directory:
- * 1. storage_db.db - storage database, used to store the data corresponding to
- * regular files in the form of key/value pair. file-name is the 'key' and
- * data is 'value'.
- * 2. directory (all subdirectories) - any subdirectory will have a regular
- * directory entry.
- */
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#define __XOPEN_SOURCE 500
-
-#include <stdint.h>
-#include <sys/time.h>
-#include <errno.h>
-#include <ftw.h>
-#include <libgen.h>
-
-#include "glusterfs.h"
-#include "dict.h"
-#include "logging.h"
-#include "bdb.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "common-utils.h"
-
-/* to be used only by fops, nobody else */
-#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
-#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table)
-
-
-int32_t
-bdb_mknod (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode,
- dev_t dev)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *key_string = NULL; /* after translating path to DB key */
- char *db_path = NULL;
- bctx_t *bctx = NULL;
- struct stat stbuf = {0,};
-
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- if (!S_ISREG(mode)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "MKNOD %"PRId64"/%s (%s): EPERM"
- "(mknod supported only for regular files. "
- "file mode '%o' not supported)",
- loc->parent->ino, loc->name, loc->path, mode);
- op_ret = -1;
- op_errno = EPERM;
- goto out;
- } /* if(!S_ISREG(mode)) */
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "MKNOD %"PRId64"/%s (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
-
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_DEBUG,
- "MKNOD %"PRId64"/%s (%s): EINVAL"
- "(failed to lookup database handle)",
- loc->parent->ino, loc->name, loc->path);
- goto out;
- }
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- op_ret = bdb_db_icreate (bctx, key_string);
- if (op_ret > 0) {
- /* create successful */
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- stbuf.st_mode = mode;
- stbuf.st_size = 0;
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, \
- stbuf.st_blksize);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "MKNOD %"PRId64"/%s (%s): ENOMEM"
- "(failed to create database entry)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = EINVAL; /* TODO: errno sari illa */
- goto out;
- }/* if (!op_ret)...else */
-
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
- return 0;
-}
-
-static inline int32_t
-is_dir_empty (xlator_t *this,
- loc_t *loc)
-{
- int32_t ret = 1;
- bctx_t *bctx = NULL;
- DIR *dir = NULL;
- char *real_path = NULL;
- void *dbstat = NULL;
- struct dirent *entry = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- bctx = bctx_lookup (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- ret = -ENOMEM;
- goto out;
- }
-
- dbstat = bdb_db_stat (bctx, NULL, 0);
- if (dbstat) {
- switch (bctx->table->access_mode)
- {
- case DB_HASH:
- ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0);
- break;
- case DB_BTREE:
- case DB_RECNO:
- ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0);
- break;
- case DB_QUEUE:
- ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0);
- break;
- case DB_UNKNOWN:
- gf_log (this->name, GF_LOG_CRITICAL,
- "unknown access-mode set for database");
- ret = 0;
- }
- } else {
- ret = -EBUSY;
- goto out;
- }
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- dir = opendir (real_path);
- if (dir == NULL) {
- ret = -errno;
- goto out;
- }
-
- while ((entry = readdir (dir))) {
- if ((!IS_BDB_PRIVATE_FILE(entry->d_name)) &&
- (!IS_DOT_DOTDOT(entry->d_name))) {
- ret = 0;
- break;
- }/* if(!IS_BDB_PRIVATE_FILE()) */
- } /* while(true) */
- closedir (dir);
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- return ret;
-}
-
-int32_t
-bdb_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- STACK_UNWIND (frame, -1, EXDEV, NULL);
- return 0;
-}
-
-int32_t
-bdb_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- STACK_UNWIND (frame, -1, EXDEV, NULL, NULL);
- return 0;
-}
-
-int32_t
-is_space_left (xlator_t *this,
- size_t size)
-{
- struct bdb_private *private = this->private;
- struct statvfs stbuf = {0,};
- int32_t ret = -1;
- fsblkcnt_t req_blocks = 0;
- fsblkcnt_t usable_blocks = 0;
-
- ret = statvfs (private->export_path, &stbuf);
- if (ret != 0) {
- ret = 0;
- } else {
- req_blocks = (size / stbuf.f_frsize) + 1;
-
- usable_blocks = (stbuf.f_bfree - BDB_ENOSPC_THRESHOLD);
-
- if (req_blocks < usable_blocks)
- ret = 1;
- else
- ret = 0;
- }
-
- return ret;
-}
-
-int32_t
-bdb_create (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- mode_t mode,
- fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EPERM;
- char *db_path = NULL;
- struct stat stbuf = {0,};
- bctx_t *bctx = NULL;
- struct bdb_private *private = NULL;
- char *key_string = NULL;
- struct bdb_fd *bfd = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- private = this->private;
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "CREATE %"PRId64"/%s (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_DEBUG,
- "CREATE %"PRId64"/%s (%s): EINVAL"
- "(database file missing)",
- loc->parent->ino, loc->name, loc->path);
- goto out;
- }
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- op_ret = bdb_db_icreate (bctx, key_string);
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "CREATE %"PRId64"/%s (%s): ENOMEM"
- "(failed to create database entry)",
- loc->parent->ino, loc->name, loc->path);
- op_errno = EINVAL; /* TODO: errno sari illa */
- goto out;
- }
-
- /* create successful */
- bfd = CALLOC (1, sizeof (*bfd));
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "CREATE %"PRId64"/%s (%s): ENOMEM"
- "(failed to allocate memory for internal fd context)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- /* NOTE: bdb_get_bctx_from () returns bctx with a ref */
- bfd->ctx = bctx;
- bfd->key = strdup (key_string);
- if (bfd->key == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "CREATE %"PRId64" (%s): ENOMEM"
- "(failed to allocate memory for internal fd->key)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- BDB_FCTX_SET (fd, this, bfd);
-
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- stbuf.st_mode = private->file_mode;
- stbuf.st_size = 0;
- stbuf.st_nlink = 1;
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
- op_ret = 0;
- op_errno = 0;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf);
-
- return 0;
-}
-
-
-/* bdb_open
- *
- * as input parameters bdb_open gets the file name, i.e key. bdb_open should
- * effectively
- * do: store key, open storage db, store storage-db pointer.
- *
- */
-int32_t
-bdb_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- bctx_t *bctx = NULL;
- char *key_string = NULL;
- struct bdb_fd *bfd = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPEN %"PRId64" (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- bfd = CALLOC (1, sizeof (*bfd));
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPEN %"PRId64" (%s): ENOMEM"
- "(failed to allocate memory for internal fd context)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- /* NOTE: bctx_parent () returns bctx with a ref */
- bfd->ctx = bctx;
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- bfd->key = strdup (key_string);
- if (bfd->key == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPEN %"PRId64" (%s): ENOMEM"
- "(failed to allocate memory for internal fd->key)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- BDB_FCTX_SET (fd, this, bfd);
- op_ret = 0;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, fd);
-
- return 0;
-}
-
-int32_t
-bdb_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- struct iovec vec = {0,};
- struct stat stbuf = {0,};
- struct bdb_fd *bfd = NULL;
- char *db_path = NULL;
- int32_t read_size = 0;
- struct iobref *iobref = NULL;
- struct iobuf *iobuf = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD"
- "(internal fd not found through fd)",
- fd->inode->ino, size, offset);
- op_errno = EBADFD;
- op_ret = -1;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EINVAL"
- "(database file missing)",
- fd->inode->ino, size, offset);
- goto out;
- }
-
- iobuf = iobuf_get (this->ctx->iobuf_pool);
- if (!iobuf) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- /* we are ready to go */
- op_ret = bdb_db_fread (bfd, iobuf->ptr, size, offset);
- read_size = op_ret;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD"
- "(failed to find entry in database)",
- fd->inode->ino, size, offset);
- op_ret = -1;
- op_errno = ENOENT;
- goto out;
- } else if (op_ret == 0) {
- goto out;
- }
-
- iobref = iobref_new ();
- if (iobref == NULL) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (size < read_size) {
- op_ret = size;
- read_size = size;
- }
-
- iobref_add (iobref, iobuf);
-
- vec.iov_base = iobuf->ptr;
- vec.iov_len = read_size;
-
- stbuf.st_ino = fd->inode->ino;
- stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0);
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
- op_ret = size;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf, iobuf);
-
- if (iobref)
- iobref_unref (iobref);
-
- if (iobuf)
- iobuf_unref (iobuf);
-
- return 0;
-}
-
-
-int32_t
-bdb_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset,
- struct iobref *iobref)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- struct stat stbuf = {0,};
- struct bdb_fd *bfd = NULL;
- int32_t idx = 0;
- off_t c_off = offset;
- int32_t c_ret = -1;
- char *db_path = NULL;
- size_t total_size = 0;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
- GF_VALIDATE_OR_GOTO (this->name, vector, out);
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "WRITEV %"PRId64" - %"PRId32",%"PRId64": EBADFD"
- "(internal fd not found through fd)",
- fd->inode->ino, count, offset);
- op_ret = -1;
- op_errno = EBADFD;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL"
- "(database file missing)",
- fd->inode->ino, count, offset);
- goto out;
- }
-
- for (idx = 0; idx < count; idx++)
- total_size += vector[idx].iov_len;
-
- if (!is_space_left (this, total_size)) {
- gf_log (this->name, GF_LOG_ERROR,
- "WRITEV %"PRId64" - %"PRId32" (%"GF_PRI_SIZET"),%"
- PRId64": ENOSPC "
- "(not enough space after internal measurement)",
- fd->inode->ino, count, total_size, offset);
- op_ret = -1;
- op_errno = ENOSPC;
- goto out;
- }
-
- /* we are ready to go */
- for (idx = 0; idx < count; idx++) {
- c_ret = bdb_db_fwrite (bfd, vector[idx].iov_base,
- vector[idx].iov_len, c_off);
- if (c_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL"
- "(database write at %"PRId64" failed)",
- fd->inode->ino, count, offset, c_off);
- break;
- } else {
- c_off += vector[idx].iov_len;
- }
- op_ret += vector[idx].iov_len;
- } /* for(idx=0;...)... */
-
- if (c_ret) {
- /* write failed after a point, not an error */
- stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0);
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size,
- stbuf.st_blksize);
- goto out;
- }
-
- /* NOTE: we want to increment stbuf->st_size, as stored in db */
- stbuf.st_size = op_ret;
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
- op_errno = 0;
-
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
- return 0;
-}
-
-int32_t
-bdb_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EPERM;
- struct bdb_fd *bfd = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "FLUSH %"PRId64": EBADFD"
- "(internal fd not found through fd)",
- fd->inode->ino);
- op_ret = -1;
- op_errno = EBADFD;
- goto out;
- }
-
- /* do nothing */
- op_ret = 0;
- op_errno = 0;
-
-out:
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-int32_t
-bdb_release (xlator_t *this,
- fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EBADFD;
- struct bdb_fd *bfd = NULL;
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "RELEASE %"PRId64": EBADFD"
- "(internal fd not found through fd)",
- fd->inode->ino);
- op_ret = -1;
- op_errno = EBADFD;
- goto out;
- }
-
- bctx_unref (bfd->ctx);
- bfd->ctx = NULL;
-
- if (bfd->key)
- FREE (bfd->key); /* we did strdup() in bdb_open() */
- FREE (bfd);
- op_ret = 0;
- op_errno = 0;
-
-out:
- return 0;
-}/* bdb_release */
-
-
-int32_t
-bdb_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t datasync)
-{
- STACK_UNWIND (frame, 0, 0);
- return 0;
-}/* bdb_fsync */
-
-static int gf_bdb_lk_log;
-
-int32_t
-bdb_lk (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t cmd,
- struct flock *lock)
-{
- struct flock nullock = {0, };
-
- if (BDB_TIMED_LOG (ENOTSUP, gf_bdb_lk_log)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LK %"PRId64": ENOTSUP "
- "(load \"features/locks\" translator to enable "
- "lock support)",
- fd->inode->ino);
- }
-
- STACK_UNWIND (frame, -1, ENOTSUP, &nullock);
- return 0;
-}/* bdb_lk */
-
-/* bdb_lookup
- *
- * there are four possibilities for a file being looked up:
- * 1. file exists and is a directory.
- * 2. file exists and is a symlink.
- * 3. file exists and is a regular file.
- * 4. file does not exist.
- * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a
- * directory or symlink, lstat() succeeds. lookup continues to check if the
- * @loc belongs to case-3 only if lstat() fails.
- * to check for case 3, bdb_lookup does a bdb_db_iread() for the given @loc.
- * (see description of bdb_db_iread() for more details on how @loc is transformed
- * into db handle and key). if check for case 1, 2 and 3 fail, we proceed to
- * conclude that file doesn't exist (case 4).
- *
- * @frame: call frame.
- * @this: xlator_t of this instance of bdb xlator.
- * @loc: loc_t specifying the file to operate upon.
- * @need_xattr: if need_xattr != 0, we are asked to return all the extended
- * attributed of @loc, if any exist, in a dictionary. if @loc is a regular
- * file and need_xattr is set, then we look for value of need_xattr. if
- * need_xattr > sizo-of-the-file @loc, then the file content of @loc is
- * returned in dictionary of xattr with 'glusterfs.content' as dictionary key.
- *
- * NOTE: bdb currently supports only directories, symlinks and regular files.
- *
- * NOTE: bdb_lookup returns the 'struct stat' of underlying file itself, in
- * case of directory and symlink (st_ino is modified as bdb allocates its own
- * set of inodes of all files). for regular files, bdb uses 'struct stat' of
- * the database file in which the @loc is stored as templete and modifies
- * st_ino (see bdb_inode_transform for more details), st_mode (can be set in
- * volfile 'option file-mode <mode>'), st_size (exact size of the @loc
- * contents), st_blocks (block count on the underlying filesystem to
- * accomodate st_size, see BDB_COUNT_BLOCKS in bdb.h for more details).
- */
-int32_t
-bdb_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- struct stat stbuf = {0, };
- int32_t op_ret = -1;
- int32_t op_errno = ENOENT;
- dict_t *xattr = NULL;
- char *pathname = NULL;
- char *directory = NULL;
- char *real_path = NULL;
- bctx_t *bctx = NULL;
- char *db_path = NULL;
- struct bdb_private *private = NULL;
- char *key_string = NULL;
- int32_t entry_size = 0;
- char *file_content = NULL;
- uint64_t need_xattr = 0;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- private = this->private;
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- pathname = strdup (loc->path);
- GF_VALIDATE_OR_GOTO (this->name, pathname, out);
-
- directory = dirname (pathname);
- GF_VALIDATE_OR_GOTO (this->name, directory, out);
-
- if (!strcmp (directory, loc->path)) {
- /* SPECIAL CASE: looking up root */
- op_ret = lstat (real_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- /* bctx_lookup() returns NULL only when its time to wind up,
- * we should shutdown functioning */
- bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64" (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- stbuf.st_ino = 1;
- stbuf.st_mode = private->dir_mode;
-
- op_ret = 0;
- goto out;
- }
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- op_ret = lstat (real_path, &stbuf);
- if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){
- bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64"/%s (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (loc->ino) {
- /* revalidating directory inode */
- stbuf.st_ino = loc->ino;
- } else {
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- }
- stbuf.st_mode = private->dir_mode;
-
- op_ret = 0;
- goto out;
-
- } else if (op_ret == 0) {
- /* a symlink */
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64"/%s (%s): ENOMEM"
- "(failed to lookup database handle)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (loc->ino) {
- stbuf.st_ino = loc->ino;
- } else {
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- }
-
- stbuf.st_mode = private->symlink_mode;
-
- op_ret = 0;
- goto out;
-
- }
-
- /* for regular files */
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64"/%s (%s): ENOMEM"
- "(failed to lookup database handle for parent)",
- loc->parent->ino, loc->name, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) {
- entry_size = bdb_db_iread (bctx, key_string, &file_content);
- } else {
- entry_size = bdb_db_iread (bctx, key_string, NULL);
- }
-
- op_ret = entry_size;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64"/%s (%s): ENOENT"
- "(database entry not found)",
- loc->parent->ino, loc->name, loc->path);
- op_errno = ENOENT;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "LOOKUP %"PRId64"/%s (%s): %s",
- loc->parent->ino, loc->name, loc->path,
- strerror (op_errno));
- goto out;
- }
-
- if (entry_size
- && (need_xattr >= entry_size)
- && (file_content)) {
- xattr = dict_new ();
- op_ret = dict_set_dynptr (xattr, "glusterfs.content",
- file_content, entry_size);
- if (op_ret < 0) {
- /* continue without giving file contents */
- FREE (file_content);
- }
- } else {
- if (file_content)
- FREE (file_content);
- }
-
- if (loc->ino) {
- /* revalidate */
- stbuf.st_ino = loc->ino;
- stbuf.st_size = entry_size;
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size,
- stbuf.st_blksize);
- } else {
- /* fresh lookup, create an inode number */
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- stbuf.st_size = entry_size;
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size,
- stbuf.st_blksize);
- }/* if(inode->ino)...else */
- stbuf.st_nlink = 1;
- stbuf.st_mode = private->file_mode;
-
- op_ret = 0;
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- if (pathname)
- free (pathname);
-
- if (xattr)
- dict_ref (xattr);
-
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr);
-
- if (xattr)
- dict_unref (xattr);
-
- return 0;
-
-}/* bdb_lookup */
-
-int32_t
-bdb_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
-
- struct stat stbuf = {0,};
- char *real_path = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- struct bdb_private *private = NULL;
- char *db_path = NULL;
- bctx_t *bctx = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- private = this->private;
- GF_VALIDATE_OR_GOTO (this->name, private, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- op_ret = lstat (real_path, &stbuf);
- op_errno = errno;
- if (op_ret == 0) {
- /* directory or symlink */
- stbuf.st_ino = loc->inode->ino;
- if (S_ISDIR(stbuf.st_mode))
- stbuf.st_mode = private->dir_mode;
- else
- stbuf.st_mode = private->symlink_mode;
- /* we are done, lets unwind the stack */
- goto out;
- }
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "STAT %"PRId64" (%s): ENOMEM"
- "(no database handle for parent)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "STAT %"PRId64" (%s): %s"
- "(failed to stat on database file)",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- stbuf.st_size = bdb_db_iread (bctx, loc->path, NULL);
- stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize);
- stbuf.st_ino = loc->inode->ino;
-
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}/* bdb_stat */
-
-
-
-/* bdb_opendir - in the world of bdb, open/opendir is all about opening
- * correspondind databases. opendir in particular, opens the database for the
- * directory which is to be opened. after opening the database, a cursor to
- * the database is also created. cursor helps us get the dentries one after
- * the other, and cursor maintains the state about current positions in
- * directory. pack 'pointer to db', 'pointer to the cursor' into
- * struct bdb_dir and store it in fd->ctx, we get from our parent xlator.
- *
- * @frame: call frame
- * @this: our information, as we filled during init()
- * @loc: location information
- * @fd: file descriptor structure (glusterfs internal)
- *
- * return value - immaterial, async call.
- *
- */
-int32_t
-bdb_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- fd_t *fd)
-{
- char *real_path = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- bctx_t *bctx = NULL;
- struct bdb_dir *bfd = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPENDIR %"PRId64" (%s): ENOMEM"
- "(no database handle for directory)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- bfd = CALLOC (1, sizeof (*bfd));
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPENDIR %"PRId64" (%s): ENOMEM"
- "(failed to allocate memory for internal fd)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto err;
- }
-
- bfd->dir = opendir (real_path);
- if (bfd->dir == NULL) {
- op_ret = -1;
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "OPENDIR %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto err;
- }
-
- /* NOTE: bctx_lookup() return bctx with ref */
- bfd->ctx = bctx;
-
- bfd->path = strdup (real_path);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "OPENDIR %"PRId64" (%s): ENOMEM"
- "(failed to allocate memory for internal fd->path)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto err;
- }
-
- BDB_FCTX_SET (fd, this, bfd);
- op_ret = 0;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, fd);
- return 0;
-err:
- if (bctx)
- bctx_unref (bctx);
- if (bfd) {
- if (bfd->dir)
- closedir (bfd->dir);
-
- FREE (bfd);
- }
-
- return 0;
-}/* bdb_opendir */
-
-int32_t
-bdb_getdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off,
- int32_t flag)
-{
- struct bdb_dir *bfd = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- size_t filled = 0;
- dir_entry_t entries = {0, };
- dir_entry_t *this_entry = NULL;
- char *entry_path = NULL;
- struct dirent *dirent = NULL;
- off_t in_case = 0;
- int32_t this_size = 0;
- DBC *cursorp = NULL;
- int32_t ret = -1;
- int32_t real_path_len = 0;
- int32_t entry_path_len = 0;
- int32_t count = 0;
- off_t offset = 0;
- size_t tmp_name_len = 0;
- struct stat db_stbuf = {0,};
- struct stat buf = {0,};
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " %o: EBADFD "
- "(failed to find internal context in fd)",
- fd->inode->ino, size, off, flag);
- op_errno = EBADFD;
- op_ret = -1;
- goto out;
- }
-
- op_ret = bdb_cursor_open (bfd->ctx, &cursorp);
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- ": EBADFD "
- "(failed to open cursor to database handle)",
- fd->inode->ino, size, off);
- op_errno = EBADFD;
- goto out;
- }
-
- if (off) {
- DBT sec = {0,}, pri = {0,}, val = {0,};
- sec.data = &(off);
- sec.size = sizeof (off);
- sec.flags = DB_DBT_USERMEM;
- val.dlen = 0;
- val.doff = 0;
- val.flags = DB_DBT_PARTIAL;
-
- op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET);
- if (op_ret == DB_NOTFOUND) {
- offset = off;
- goto dir_read;
- }
- }
-
- while (filled <= size) {
- DBT sec = {0,}, pri = {0,}, val = {0,};
-
- this_entry = NULL;
-
- sec.flags = DB_DBT_MALLOC;
- pri.flags = DB_DBT_MALLOC;
- val.dlen = 0;
- val.doff = 0;
- val.flags = DB_DBT_PARTIAL;
- op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT);
-
- if (op_ret == DB_NOTFOUND) {
- /* we reached end of the directory */
- op_ret = 0;
- op_errno = 0;
- break;
- } else if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET
- ",%"PRId64":"
- "(failed to read the next entry from database)",
- fd->inode->ino, size, off);
- op_errno = ENOENT;
- break;
- } /* if (op_ret == DB_NOTFOUND)...else if...else */
-
- if (pri.data == NULL) {
- /* NOTE: currently ignore when we get key.data == NULL.
- * FIXME: we should not get key.data = NULL */
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET
- ",%"PRId64":"
- "(null key read for entry from database)",
- fd->inode->ino, size, off);
- continue;
- }/* if(key.data)...else */
-
- this_entry = CALLOC (1, sizeof (*this_entry));
- if (this_entry == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " - %s:"
- "(failed to allocate memory for an entry)",
- fd->inode->ino, size, off, strerror (errno));
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- this_entry->name = CALLOC (pri.size + 1, sizeof (char));
- if (this_entry->name == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " - %s:"
- "(failed to allocate memory for an "
- "entry->name)",
- fd->inode->ino, size, off, strerror (errno));
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- memcpy (this_entry->name, pri.data, pri.size);
- this_entry->buf = db_stbuf;
- this_entry->buf.st_size = bdb_db_iread (bfd->ctx,
- this_entry->name, NULL);
- this_entry->buf.st_blocks = BDB_COUNT_BLOCKS (
- this_entry->buf.st_size,
- this_entry->buf.st_blksize);
-
- this_entry->buf.st_ino = bdb_inode_transform (fd->inode->ino,
- pri.data,
- pri.size);
- count++;
-
- this_entry->next = entries.next;
- this_entry->link = "";
- entries.next = this_entry;
- /* if size is 0, count can never be = size,
- * so entire dir is read */
- if (sec.data)
- FREE (sec.data);
-
- if (pri.data)
- FREE (pri.data);
-
- if (count == size)
- break;
- }/* while */
- bdb_cursor_close (bfd->ctx, cursorp);
- op_ret = count;
- op_errno = 0;
- if (count >= size)
- goto out;
-dir_read:
- /* hungry kyaa? */
- if (!offset) {
- rewinddir (bfd->dir);
- } else {
- seekdir (bfd->dir, offset);
- }
-
- while (filled <= size) {
- this_entry = NULL;
- this_size = 0;
-
- in_case = telldir (bfd->dir);
- dirent = readdir (bfd->dir);
- if (!dirent)
- break;
-
- if (IS_BDB_PRIVATE_FILE(dirent->d_name))
- continue;
-
- tmp_name_len = strlen (dirent->d_name);
- if (entry_path_len < (real_path_len + 1 + (tmp_name_len) + 1)) {
- entry_path_len = real_path_len + tmp_name_len + 1024;
- entry_path = realloc (entry_path, entry_path_len);
- if (entry_path == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET","
- "%"PRId64" - %s: (failed to allocate "
- "memory for an entry_path)",
- fd->inode->ino, size, off,
- strerror (errno));
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
- }
-
- strncpy (&entry_path[real_path_len+1], dirent->d_name,
- tmp_name_len);
- op_ret = stat (entry_path, &buf);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " - %s:"
- " (failed to stat on an entry '%s')",
- fd->inode->ino, size, off,
- strerror (errno), entry_path);
- goto out; /* FIXME: shouldn't we continue here */
- }
-
- if ((flag == GF_GET_DIR_ONLY) &&
- ((ret != -1) && (!S_ISDIR(buf.st_mode)))) {
- continue;
- }
-
- this_entry = CALLOC (1, sizeof (*this_entry));
- if (this_entry == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " - %s:"
- "(failed to allocate memory for an entry)",
- fd->inode->ino, size, off, strerror (errno));
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- this_entry->name = strdup (dirent->d_name);
- if (this_entry->name == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64
- " - %s:"
- "(failed to allocate memory for an "
- "entry->name)",
- fd->inode->ino, size, off, strerror (errno));
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- this_entry->buf = buf;
-
- this_entry->buf.st_ino = -1;
- if (S_ISLNK(this_entry->buf.st_mode)) {
- char linkpath[ZR_PATH_MAX] = {0,};
- ret = readlink (entry_path, linkpath, ZR_PATH_MAX);
- if (ret != -1) {
- linkpath[ret] = '\0';
- this_entry->link = strdup (linkpath);
- }
- } else {
- this_entry->link = "";
- }
-
- count++;
-
- this_entry->next = entries.next;
- entries.next = this_entry;
-
- /* if size is 0, count can never be = size,
- * so entire dir is read */
- if (count == size)
- break;
- }
- op_ret = filled;
- op_errno = 0;
-
-out:
- gf_log (this->name, GF_LOG_DEBUG,
- "GETDENTS %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")"
- "/%"GF_PRI_SIZET",%"PRId64":"
- "(failed to read the next entry from database)",
- fd->inode->ino, filled, count, size, off);
-
- STACK_UNWIND (frame, count, op_errno, &entries);
-
- while (entries.next) {
- this_entry = entries.next;
- entries.next = entries.next->next;
- FREE (this_entry->name);
- FREE (this_entry);
- }
-
- return 0;
-}/* bdb_getdents */
-
-
-int32_t
-bdb_releasedir (xlator_t *this,
- fd_t *fd)
-{
- int32_t op_ret = 0;
- int32_t op_errno = 0;
- struct bdb_dir *bfd = NULL;
-
- BDB_FCTX_GET (fd, this, &bfd);
- if (bfd == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "RELEASEDIR %"PRId64": EBADFD",
- fd->inode->ino);
- op_errno = EBADFD;
- op_ret = -1;
- goto out;
- }
-
- if (bfd->path) {
- free (bfd->path);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "RELEASEDIR %"PRId64": (bfd->path is NULL)",
- fd->inode->ino);
- }
-
- if (bfd->dir) {
- closedir (bfd->dir);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "RELEASEDIR %"PRId64": (bfd->dir is NULL)",
- fd->inode->ino);
- }
-
- if (bfd->ctx) {
- bctx_unref (bfd->ctx);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "RELEASEDIR %"PRId64": (bfd->ctx is NULL)",
- fd->inode->ino);
- }
-
- free (bfd);
-
-out:
- return 0;
-}/* bdb_releasedir */
-
-
-int32_t
-bdb_readlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- size_t size)
-{
- char *dest = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = EPERM;
- char *real_path = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- dest = alloca (size + 1);
- GF_VALIDATE_OR_GOTO (this->name, dest, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- op_ret = readlink (real_path, dest, size);
-
- if (op_ret > 0)
- dest[op_ret] = 0;
-
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "READLINK %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- }
-out:
- STACK_UNWIND (frame, op_ret, op_errno, dest);
-
- return 0;
-}/* bdb_readlink */
-
-
-int32_t
-bdb_mkdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode)
-{
- int32_t op_ret = -1;
- int32_t ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct stat stbuf = {0, };
- bctx_t *bctx = NULL;
- char *key_string = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- op_ret = mkdir (real_path, mode);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "MKDIR %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- op_ret = chown (real_path, frame->root->uid, frame->root->gid);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "MKDIR %"PRId64" (%s): %s "
- "(failed to do chmod)",
- loc->ino, loc->path, strerror (op_errno));
- goto err;
- }
-
- op_ret = lstat (real_path, &stbuf);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "MKDIR %"PRId64" (%s): %s "
- "(failed to do lstat)",
- loc->ino, loc->path, strerror (op_errno));
- goto err;
- }
-
- bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "MKDIR %"PRId64" (%s): ENOMEM"
- "(no database handle for parent)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto err;
- }
-
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino, key_string,
- strlen (key_string));
-
- goto out;
-
-err:
- ret = rmdir (real_path);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "MKDIR %"PRId64" (%s): %s"
- "(failed to do rmdir)",
- loc->ino, loc->path, strerror (errno));
- }
-
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
-
- return 0;
-}/* bdb_mkdir */
-
-
-int32_t
-bdb_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- bctx_t *bctx = NULL;
- char *real_path = NULL;
- char *key_string = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "UNLINK %"PRId64" (%s): ENOMEM"
- "(no database handle for parent)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
- op_ret = bdb_db_iremove (bctx, key_string);
- if (op_ret == DB_NOTFOUND) {
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = unlink (real_path);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "UNLINK %"PRId64" (%s): %s"
- "(symlink unlink failed)",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
- } else if (op_ret == 0) {
- op_errno = 0;
- }
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno);
-
- return 0;
-}/* bdb_unlink */
-
-
-
-static int32_t
-bdb_do_rmdir (xlator_t *this,
- loc_t *loc)
-{
- char *real_path = NULL;
- int32_t ret = -1;
- bctx_t *bctx = NULL;
- DB_ENV *dbenv = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- dbenv = BDB_ENV(this);
- GF_VALIDATE_OR_GOTO (this->name, dbenv, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- bctx = bctx_lookup (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- ret = -ENOMEM;
- goto out;
- }
-
- LOCK(&bctx->lock);
- {
- if ((bctx->primary == NULL)
- || (bctx->secondary == NULL)) {
- goto unlock;
- }
-
- ret = bctx->primary->close (bctx->primary, 0);
- if (ret < 0) {
- ret = -EINVAL;
- }
-
- ret = bctx->secondary->close (bctx->secondary, 0);
- if (ret < 0) {
- ret = -EINVAL;
- }
-
- ret = dbenv->dbremove (dbenv, NULL, bctx->db_path,
- "primary", 0);
- if (ret < 0) {
- ret = -EBUSY;
- }
-
- ret = dbenv->dbremove (dbenv, NULL, bctx->db_path,
- "secondary", 0);
- if (ret != 0) {
- ret = -EBUSY;
- }
- }
-unlock:
- UNLOCK(&bctx->lock);
-
- if (ret) {
- goto out;
- }
- ret = rmdir (real_path);
-
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- return ret;
-}
-
-int32_t
-bdb_rmdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- op_ret = is_dir_empty (this, loc);
- if (op_ret < 0) {
- op_errno = -op_ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "RMDIR %"PRId64" (%s): %s"
- "(internal rmdir routine returned error)",
- loc->ino, loc->path, strerror (op_errno));
- } else if (op_ret == 0) {
- op_ret = -1;
- op_errno = ENOTEMPTY;
- gf_log (this->name, GF_LOG_DEBUG,
- "RMDIR %"PRId64" (%s): ENOTEMPTY",
- loc->ino, loc->path);
- goto out;
- }
-
- op_ret = bdb_do_rmdir (this, loc);
- if (op_ret < 0) {
- op_errno = -op_ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "RMDIR %"PRId64" (%s): %s"
- "(internal rmdir routine returned error)",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
-out:
- STACK_UNWIND (frame, op_ret, op_errno);
-
- return 0;
-} /* bdb_rmdir */
-
-int32_t
-bdb_symlink (call_frame_t *frame,
- xlator_t *this,
- const char *linkname,
- loc_t *loc)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct stat stbuf = {0,};
- struct bdb_private *private = NULL;
- bctx_t *bctx = NULL;
- char *key_string = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
- GF_VALIDATE_OR_GOTO (this->name, linkname, out);
-
- private = this->private;
- GF_VALIDATE_OR_GOTO (this->name, private, out);
-
- MAKE_KEY_FROM_PATH (key_string, loc->path);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = symlink (linkname, real_path);
- op_errno = errno;
- if (op_ret == 0) {
- op_ret = lstat (real_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "SYMLINK %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto err;
- }
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "SYMLINK %"PRId64" (%s): ENOMEM"
- "(no database handle for parent)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto err;
- }
-
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- stbuf.st_mode = private->symlink_mode;
-
- goto out;
- }
-err:
- op_ret = unlink (real_path);
- op_errno = errno;
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "SYMLINK %"PRId64" (%s): %s"
- "(failed to unlink the created symlink)",
- loc->ino, loc->path, strerror (op_errno));
- }
- op_ret = -1;
- op_errno = ENOENT;
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
-
- return 0;
-} /* bdb_symlink */
-
-int32_t
-bdb_chmod (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct stat stbuf = {0,};
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = lstat (real_path, &stbuf);
- op_errno = errno;
- if (op_ret != 0) {
- if (op_errno == ENOENT) {
- op_errno = EPERM;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "CHMOD %"PRId64" (%s): %s"
- "(lstat failed)",
- loc->ino, loc->path, strerror (op_errno));
- }
- goto out;
- }
-
- /* directory or symlink */
- op_ret = chmod (real_path, mode);
- op_errno = errno;
-
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}/* bdb_chmod */
-
-
-int32_t
-bdb_chown (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- uid_t uid,
- gid_t gid)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct stat stbuf = {0,};
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = lstat (real_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- if (op_errno == ENOENT) {
- op_errno = EPERM;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "CHOWN %"PRId64" (%s): %s"
- "(lstat failed)",
- loc->ino, loc->path, strerror (op_errno));
- }
- goto out;
- }
-
- /* directory or symlink */
- op_ret = lchown (real_path, uid, gid);
- op_errno = errno;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}/* bdb_chown */
-
-
-int32_t
-bdb_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct stat stbuf = {0,};
- char *db_path = NULL;
- bctx_t *bctx = NULL;
- char *key_string = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- bctx = bctx_parent (B_TABLE(this), loc->path);
- if (bctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "TRUNCATE %"PRId64" (%s): ENOMEM"
- "(no database handle for parent)",
- loc->ino, loc->path);
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- MAKE_KEY_FROM_PATH (key_string, loc->path);
-
- /* now truncate */
- MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory);
- op_ret = lstat (db_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "TRUNCATE %"PRId64" (%s): %s"
- "(lstat on database file failed)",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- if (loc->inode->ino) {
- stbuf.st_ino = loc->inode->ino;
- }else {
- stbuf.st_ino = bdb_inode_transform (loc->parent->ino,
- key_string,
- strlen (key_string));
- }
-
- op_ret = bdb_db_itruncate (bctx, key_string);
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "TRUNCATE %"PRId64" (%s): EINVAL"
- "(truncating entry in database failed - %s)",
- loc->ino, loc->path, db_strerror (op_ret));
- op_errno = EINVAL; /* TODO: better errno */
- }
-
-out:
- if (bctx) {
- /* NOTE: bctx_unref always returns success,
- * see description of bctx_unref for more details */
- bctx_unref (bctx);
- }
-
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}/* bdb_truncate */
-
-
-int32_t
-bdb_utimens (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- struct timespec ts[2])
-{
- int32_t op_ret = -1;
- int32_t op_errno = EPERM;
- char *real_path = NULL;
- struct stat stbuf = {0,};
- struct timeval tv[2] = {{0,},};
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = sys_lstat (real_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- if (op_errno == ENOENT) {
- op_errno = EPERM;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "UTIMENS %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- }
- goto out;
- }
-
- /* directory or symlink */
- tv[0].tv_sec = ts[0].tv_sec;
- tv[0].tv_usec = ts[0].tv_nsec / 1000;
- tv[1].tv_sec = ts[1].tv_sec;
- tv[1].tv_usec = ts[1].tv_nsec / 1000;
-
- op_ret = lutimes (real_path, tv);
- if ((op_ret == -1) && (errno == ENOSYS)) {
- op_ret = sys_utimes (real_path, tv);
- }
-
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "UTIMENS %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- op_ret = sys_lstat (real_path, &stbuf);
- if (op_ret != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "UTIMENS %"PRId64" (%s): %s",
- loc->ino, loc->path, strerror (op_errno));
- goto out;
- }
-
- stbuf.st_ino = loc->inode->ino;
-
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}/* bdb_utimens */
-
-int32_t
-bdb_statfs (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- char *real_path = NULL;
- struct statvfs buf = {0, };
-
- GF_VALIDATE_OR_GOTO ("bdb", frame, out);
- GF_VALIDATE_OR_GOTO ("bdb", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- op_ret = statvfs (real_path, &buf);
- op_errno = errno;
-out:
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
- return 0;
-}/* bdb_statfs */
-
-static int gf_bdb_xattr_log;
-
-/* bdb_setxattr - set extended attributes.
- *
- * bdb allows setxattr operation only on directories.
- * bdb reserves 'glusterfs.file.<attribute-name>' to operate on the content
- * of the files under the specified directory.
- * 'glusterfs.file.<attribute-name>' transforms to contents of file of name
- * '<attribute-name>' under specified directory.
- *
- * @frame: call frame.
- * @this:  xlator_t of this instance of bdb xlator.
- * @loc:   loc_t specifying the file to operate upon.
- * @dict:  list of extended attributes to set on @loc.
- * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if
- *         it exists) or XATTR_CREATE (create an extended attribute only if it
- *         doesn't already exist).
- *
- * Keys are handled in list order. A database failure stops the walk; the
- * call unwinds with the result of the last key that was processed.
- */
-int32_t
-bdb_setxattr (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              dict_t *dict,
-              int flags)
-{
-        int32_t      op_ret    = -1;
-        int32_t      op_errno  = EINVAL;
-        data_pair_t *trav      = NULL;
-        bctx_t      *bctx      = NULL;
-        char        *real_path = NULL;
-        char        *key       = NULL;
-        int          timed     = 0;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, loc, out);
-        GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
-        MAKE_REAL_PATH (real_path, this, loc->path);
-        if (!S_ISDIR (loc->inode->st_mode)) {
-                op_ret   = -1;
-                op_errno = ENOATTR;
-                goto out;
-        }
-
-        /* dict is only dereferenced now that it has been validated */
-        for (trav = dict->members_list; trav != NULL; trav = trav->next) {
-                if (!GF_FILE_CONTENT_REQUEST(trav->key)) {
-                        /* do plain setxattr */
-                        op_ret = lsetxattr (real_path,
-                                            trav->key, trav->value->data,
-                                            trav->value->len,
-                                            flags);
-                        if (op_ret != -1) {
-                                op_errno = 0;
-                                continue;
-                        }
-
-                        op_errno = errno;
-                        if ((op_errno == ENOATTR) || (op_errno == EEXIST)) {
-                                /* don't log, normal behaviour */
-                                continue;
-                        }
-
-                        timed = BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log);
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "SETXATTR %"PRId64" (%s) - %s: %s",
-                                loc->ino, loc->path, trav->key,
-                                strerror (op_errno));
-                        if (timed) {
-                                /* do not continue, break out */
-                                break;
-                        }
-                        continue;
-                }
-
-                /* BDB_KEY_FROM_FREQUEST_KEY indexes a variable literally
-                 * named 'key' rather than its argument, so 'key' must
-                 * already hold the request key when the macro expands */
-                key = trav->key;
-                key = BDB_KEY_FROM_FREQUEST_KEY(trav->key);
-
-                bctx = bctx_lookup (B_TABLE(this), loc->path);
-                if (bctx == NULL) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "SETXATTR %"PRId64" (%s) - %s: ENOMEM"
-                                "(no database handle for directory)",
-                                loc->ino, loc->path, key);
-                        op_ret   = -1;
-                        op_errno = ENOMEM;
-                        goto out;
-                }
-
-                if (flags & XATTR_REPLACE) {
-                        op_ret = bdb_db_itruncate (bctx, key);
-                        if (op_ret == -1) {
-                                /* key doesn't exist in database */
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "SETXATTR %"PRId64" (%s) - %s:"
-                                        " (entry not present in "
-                                        "database)",
-                                        loc->ino, loc->path, key);
-                                op_errno = ENOATTR;
-                        } else {
-                                op_ret = bdb_db_iwrite (bctx, key,
-                                                        trav->value->data,
-                                                        trav->value->len);
-                                if (op_ret != 0) {
-                                        op_ret   = -1;
-                                        op_errno = ENOATTR;
-                                } else {
-                                        op_errno = 0;
-                                }
-                        }
-                } else {
-                        /* fresh create */
-                        op_ret = bdb_db_iwrite (bctx, key,
-                                                trav->value->data,
-                                                trav->value->len);
-                        if (op_ret != 0) {
-                                op_ret   = -1;
-                                op_errno = EEXIST;
-                        } else {
-                                op_ret   = 0;
-                                op_errno = 0;
-                        }
-                }
-
-                /* drop the reference before deciding whether to stop, so a
-                 * failed key does not leak it.
-                 * NOTE: bctx_unref always returns success, see
-                 * description of bctx_unref for more details */
-                bctx_unref (bctx);
-                bctx = NULL;
-
-                if (op_ret == -1) {
-                        break;
-                }
-        }
-out:
-        STACK_UNWIND (frame, op_ret, op_errno);
-        return 0;
-}/* bdb_setxattr */
-
-
-/* bdb_getxattr - get extended attributes.
- *
- * bdb allows getxattr operation only on directories.
- * bdb_getxattr retrieves the whole content of the file, when
- * glusterfs.file.<attribute-name> is specified.
- * For any other @name, every extended attribute listed on the directory is
- * read and returned in the dictionary.
- *
- * @frame: call frame.
- * @this:  xlator_t of this instance of bdb xlator.
- * @loc:   loc_t specifying the file to operate upon.
- * @name:  name of extended attributes to get for @loc.
- *
- * Unwinds with 0 and the filled dictionary on success, or -1 and an errno
- * value on failure.
- *
- * NOTE: see description of bdb_setxattr for details on how
- *       'glusterfs.file.<attribute-name>' is handled by bdb.
- */
-int32_t
-bdb_getxattr (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              const char *name)
-{
-        int32_t     op_ret      = -1;
-        int32_t     op_errno    = EINVAL;
-        dict_t     *dict        = NULL;
-        bctx_t     *bctx        = NULL;
-        char       *buf         = NULL;
-        char       *key_string  = NULL;
-        ssize_t     size        = 0;
-        ssize_t     list_offset = 0;
-        size_t      key_len     = 0;
-        char       *real_path   = NULL;
-        const char *key         = NULL;
-        char       *value       = NULL;
-        char       *list        = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, loc, out);
-        GF_VALIDATE_OR_GOTO (this->name, name, out);
-
-        dict = dict_new ();
-        GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
-        if (!S_ISDIR (loc->inode->st_mode)) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "GETXATTR %"PRId64" (%s) - %s: ENOATTR "
-                        "(not a directory)",
-                        loc->ino, loc->path, name);
-                op_ret   = -1;
-                op_errno = ENOATTR;
-                goto out;
-        }
-
-        if (GF_FILE_CONTENT_REQUEST(name)) {
-                bctx = bctx_lookup (B_TABLE(this), loc->path);
-                if (bctx == NULL) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: ENOMEM"
-                                "(no database handle for directory)",
-                                loc->ino, loc->path, name);
-                        op_ret   = -1;
-                        op_errno = ENOMEM;
-                        goto out;
-                }
-
-                /* BDB_KEY_FROM_FREQUEST_KEY indexes a variable literally
-                 * named 'key' rather than its argument, so 'key' must
-                 * point at the request name when the macro expands */
-                key = name;
-                key_string = (char *) BDB_KEY_FROM_FREQUEST_KEY(name);
-
-                op_ret = bdb_db_iread (bctx, key_string, &buf);
-                if (op_ret == -1) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: ENOATTR"
-                                "(attribute not present in database)",
-                                loc->ino, loc->path, name);
-                        op_errno = ENOATTR;
-                        goto out;
-                }
-
-                op_ret = dict_set_dynptr (dict, (char *)name, buf, op_ret);
-                if (op_ret < 0) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: ENOATTR"
-                                "(attribute present in database, "
-                                "dict set failed)",
-                                loc->ino, loc->path, name);
-                        /* same ownership rule as the listing loop below:
-                         * a value the dict refused is released here */
-                        FREE (buf);
-                        op_ret   = -1;
-                        op_errno = ENODATA;
-                        goto out;
-                }
-
-                goto done;
-        }
-
-        MAKE_REAL_PATH (real_path, this, loc->path);
-        /* signed, so that a -1 failure return is actually detectable */
-        size = sys_llistxattr (real_path, NULL, 0);
-        op_errno = errno;
-        if (size < 0) {
-                if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: %s",
-                                loc->ino, loc->path, name, strerror (op_errno));
-                } else {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: %s",
-                                loc->ino, loc->path, name, strerror (op_errno));
-                }
-                op_ret   = -1;
-                op_errno = ENOATTR;
-
-                goto out;
-        }
-
-        if (size == 0)
-                goto done;
-
-        list = alloca (size + 1);
-
-        size = sys_llistxattr (real_path, list, size);
-        if (size < 0) {
-                op_ret   = -1;
-                op_errno = errno;
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "GETXATTR %"PRId64" (%s) - %s: %s",
-                        loc->ino, loc->path, name, strerror (op_errno));
-                goto out;
-        }
-        /* guarantee that strlen() below stops inside the buffer */
-        list[size] = '\0';
-
-        list_offset = 0;
-        while (list_offset < size) {
-                /* names are used in place; nothing is copied into a
-                 * fixed-size buffer */
-                key     = list + list_offset;
-                key_len = strlen (key);
-                if (key_len == 0)
-                        break;
-
-                /* advance first, so that skipping a key always makes
-                 * progress */
-                list_offset += key_len + 1;
-
-                op_ret = sys_lgetxattr (real_path, key, NULL, 0);
-                if (op_ret == -1) {
-                        op_errno = errno;
-                        goto out;
-                }
-
-                value = CALLOC (op_ret + 1, sizeof(char));
-                if (value == NULL) {
-                        op_ret   = -1;
-                        op_errno = ENOMEM;
-                        goto out;
-                }
-
-                op_ret = sys_lgetxattr (real_path, key, value,
-                                        op_ret);
-                if (op_ret == -1) {
-                        op_errno = errno;
-                        FREE (value);
-                        goto out;
-                }
-                value [op_ret] = '\0';
-
-                op_ret = dict_set_dynptr (dict, (char *)key,
-                                          value, op_ret);
-                if (op_ret < 0) {
-                        FREE (value);
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "GETXATTR %"PRId64" (%s) - %s: "
-                                "(skipping key %s)",
-                                loc->ino, loc->path, name, key);
-                        continue;
-                }
-        } /* while(list_offset<size) */
-done:
-        op_ret   = 0;
-        op_errno = 0;
-out:
-        if(bctx) {
-                /* NOTE: bctx_unref always returns success,
-                 * see description of bctx_unref for more details */
-                bctx_unref (bctx);
-        }
-
-        STACK_UNWIND (frame, op_ret, op_errno, dict);
-
-        if (dict)
-                dict_unref (dict);
-
-        return 0;
-}/* bdb_getxattr */
-
-
-/* bdb_removexattr - remove an extended attribute.
- *
- * bdb allows removexattr operation only on directories.
- * 'glusterfs.file.<attribute-name>' removes the database entry
- * '<attribute-name>' of the directory; any other @name is handed to
- * lremovexattr().
- *
- * @frame: call frame.
- * @this:  xlator_t of this instance of bdb xlator.
- * @loc:   loc_t specifying the directory to operate upon.
- * @name:  name of the extended attribute to remove.
- */
-int32_t
-bdb_removexattr (call_frame_t *frame,
-                 xlator_t *this,
-                 loc_t *loc,
-                 const char *name)
-{
-        int32_t     op_ret    = -1;
-        int32_t     op_errno  = EINVAL;
-        bctx_t     *bctx      = NULL;
-        char       *real_path = NULL;
-        const char *key       = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, loc, out);
-        GF_VALIDATE_OR_GOTO (this->name, name, out);
-
-        if (!S_ISDIR(loc->inode->st_mode)) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR "
-                        "(not a directory)",
-                        loc->ino, loc->path, name);
-                op_ret   = -1;
-                op_errno = ENOATTR;
-                goto out;
-        }
-
-        if (GF_FILE_CONTENT_REQUEST(name)) {
-                bctx = bctx_lookup (B_TABLE(this), loc->path);
-                if (bctx == NULL) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR"
-                                "(no database handle for directory)",
-                                loc->ino, loc->path, name);
-                        op_ret   = -1;
-                        op_errno = ENOATTR;
-                        goto out;
-                }
-
-                /* setxattr/getxattr store and read the entry under the
-                 * name with the 'glusterfs.file.' prefix stripped, so the
-                 * same stripped key has to be removed here.
-                 * BDB_KEY_FROM_FREQUEST_KEY indexes a variable literally
-                 * named 'key', hence the assignment before it expands */
-                key = name;
-                key = BDB_KEY_FROM_FREQUEST_KEY(name);
-
-                op_ret = bdb_db_iremove (bctx, key);
-                if (op_ret == -1) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR"
-                                "(no such attribute in database)",
-                                loc->ino, loc->path, name);
-                        op_errno = ENOATTR;
-                } else {
-                        op_errno = 0;
-                }
-                goto out;
-        }
-
-        MAKE_REAL_PATH(real_path, this, loc->path);
-        op_ret = lremovexattr (real_path, name);
-        if (op_ret == -1) {
-                op_errno = errno;
-                if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "REMOVEXATTR %"PRId64" (%s) - %s: %s",
-                                loc->ino, loc->path, name, strerror (op_errno));
-                } else {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "REMOVEXATTR %"PRId64" (%s) - %s: %s",
-                                loc->ino, loc->path, name, strerror (op_errno));
-                }
-        } else {
-                /* errno is stale after a successful call */
-                op_errno = 0;
-        }
-out:
-        if (bctx) {
-                /* NOTE: bctx_unref always returns success,
-                 * see description of bctx_unref for more details */
-                bctx_unref (bctx);
-        }
-
-        STACK_UNWIND (frame, op_ret, op_errno);
-        return 0;
-}/* bdb_removexattr */
-
-
-/* bdb_fsyncdir - fsyncdir entry point.
- *
- * No sync work is performed here; the call only verifies that @fd carries
- * a bdb context and reports success when it does.
- *
- * @frame:    call frame.
- * @this:     xlator_t of this instance of bdb xlator.
- * @fd:       directory fd.
- * @datasync: unused.
- */
-int32_t
-bdb_fsyncdir (call_frame_t *frame,
-              xlator_t *this,
-              fd_t *fd,
-              int datasync)
-{
-        int32_t        op_ret   = -1;
-        int32_t        op_errno = EINVAL;
-        struct bdb_fd *bfd      = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
-        BDB_FCTX_GET (fd, this, &bfd);
-        if (bfd == NULL) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "FSYNCDIR %"PRId64": EBADFD"
-                        "(failed to find internal context from fd)",
-                        fd->inode->ino);
-                op_errno = EBADFD;
-                op_ret   = -1;
-                goto out;
-        }
-
-        /* without this the call unwound with -1/EINVAL even for a valid fd */
-        op_ret   = 0;
-        op_errno = 0;
-
-out:
-        STACK_UNWIND (frame, op_ret, op_errno);
-
-        return 0;
-}/* bdb_fsyncdir */
-
-
-/* bdb_access - check accessibility of the path named by @loc.
- *
- * The path is resolved with MAKE_REAL_PATH and checked with access();
- * database entries are not handled yet.
- */
-int32_t
-bdb_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
-{
-        char    *export_path = NULL;
-        int32_t  op_errno    = EINVAL;
-        int32_t  op_ret      = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
-        MAKE_REAL_PATH (export_path, this, loc->path);
-
-        /* TODO: implement for db entries */
-        op_ret   = access (export_path, mask);
-        op_errno = errno;
-out:
-        STACK_UNWIND (frame, op_ret, op_errno);
-        return 0;
-}/* bdb_access */
-
-
-/* bdb_ftruncate - not implemented; always unwinds with -1 and a zeroed
- * stat buffer (EPERM unless argument validation fails first). */
-int32_t
-bdb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
-{
-        struct stat zero_stat = {0,};
-        int32_t     op_errno  = EPERM;
-        int32_t     op_ret    = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-        /* TODO: implement */
-out:
-        STACK_UNWIND (frame, op_ret, op_errno, &zero_stat);
-
-        return 0;
-}
-
-/* bdb_fchown - not implemented; always unwinds with -1 and a zeroed
- * stat buffer (EPERM unless argument validation fails first). */
-int32_t
-bdb_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd,
-            uid_t uid, gid_t gid)
-{
-        struct stat zero_stat = {0,};
-        int32_t     op_errno  = EPERM;
-        int32_t     op_ret    = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
-        /* TODO: implement */
-out:
-        STACK_UNWIND (frame, op_ret, op_errno, &zero_stat);
-
-        return 0;
-}
-
-
-/* bdb_fchmod - not implemented; always unwinds with -1 and a zeroed
- * stat buffer (EPERM unless argument validation fails first). */
-int32_t
-bdb_fchmod (call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode)
-{
-        struct stat zero_stat = {0,};
-        int32_t     op_errno  = EPERM;
-        int32_t     op_ret    = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
-        /* TODO: implement */
-out:
-        STACK_UNWIND (frame, op_ret, op_errno, &zero_stat);
-
-        return 0;
-}
-
-/* bdb_setdents - recreate a list of directory entries under @fd.
- *
- * Directories are created with mkdir() and given the requested mode and
- * ownership; regular files become empty database entries. Symlinks and
- * other file types are not handled.
- *
- * @frame:   call frame.
- * @this:    xlator_t of this instance of bdb xlator.
- * @fd:      directory fd under which the entries are created.
- * @flags:   GF_SET_DIR_ONLY restricts the call to directory entries.
- * @entries: list head; the entries proper start at entries->next.
- * @count:   unused.
- *
- * Every entry is attempted. The call unwinds with 0 when all of them
- * succeeded, otherwise with -1 and the errno of the last failure.
- */
-int32_t
-bdb_setdents (call_frame_t *frame,
-              xlator_t *this,
-              fd_t *fd,
-              int32_t flags,
-              dir_entry_t *entries,
-              int32_t count)
-{
-        int32_t         op_ret   = -1;
-        int32_t         op_errno = EINVAL;
-        int32_t         ret      = 0;
-        int             written  = 0;
-        struct bdb_dir *bfd      = NULL;
-        dir_entry_t    *trav     = NULL;
-        char            pathname[ZR_PATH_MAX] = {0,};
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-        GF_VALIDATE_OR_GOTO (this->name, entries, out);
-
-        BDB_FCTX_GET (fd, this, &bfd);
-        if (bfd == NULL) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "SETDENTS %"PRId64": EBADFD",
-                        fd->inode->ino);
-                op_errno = EBADFD;
-                op_ret   = -1;
-                goto out;
-        }
-
-        /* success unless one of the entries below fails; failures are
-         * sticky so a later success cannot hide them */
-        op_ret   = 0;
-        op_errno = 0;
-
-        for (trav = entries->next; trav != NULL; trav = trav->next) {
-                /* bounded replacement for strcpy()/strcat() into pathname */
-                written = snprintf (pathname, sizeof (pathname), "%s/%s",
-                                    bfd->path, trav->name);
-                if ((written < 0) || ((size_t) written >= sizeof (pathname))) {
-                        op_ret   = -1;
-                        op_errno = ENAMETOOLONG;
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "SETDENTS %"PRId64" - %s/%s: %s "
-                                "(path too long)",
-                                fd->inode->ino, bfd->path, trav->name,
-                                strerror (op_errno));
-                        continue;
-                }
-
-                if (S_ISDIR(trav->buf.st_mode)) {
-                        /* If the entry is directory, create it by calling
-                         * 'mkdir'. If directory is not present, it will be
-                         * created, if its present, no worries even if it fails.
-                         */
-                        ret = mkdir (pathname, trav->buf.st_mode);
-                        if ((ret == -1) && (errno != EEXIST)) {
-                                op_errno = errno;
-                                op_ret   = -1;
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "SETDENTS %"PRId64" - %s: %s "
-                                        "(mkdir failed)",
-                                        fd->inode->ino, pathname,
-                                        strerror (op_errno));
-                                continue;
-                        }
-
-                        /* Change the mode */
-                        ret = chmod (pathname, trav->buf.st_mode);
-                        if (ret < 0) {
-                                op_ret   = -1;
-                                op_errno = errno;
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "SETDENTS %"PRId64" - %s: %s "
-                                        "(chmod failed)",
-                                        fd->inode->ino, pathname,
-                                        strerror (op_errno));
-                                continue;
-                        }
-
-                        /* change the ownership */
-                        ret = chown (pathname, trav->buf.st_uid,
-                                     trav->buf.st_gid);
-                        if (ret != 0) {
-                                op_ret   = -1;
-                                op_errno = errno;
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "SETDENTS %"PRId64" - %s: %s "
-                                        "(chown failed)",
-                                        fd->inode->ino, pathname,
-                                        strerror (op_errno));
-                                continue;
-                        }
-                } else if ((flags == GF_SET_IF_NOT_PRESENT) ||
-                           (flags != GF_SET_DIR_ONLY)) {
-                        if (S_ISREG (trav->buf.st_mode)) {
-                                /* Create a 0 byte file here */
-                                ret = bdb_db_icreate (bfd->ctx,
-                                                      trav->name);
-                                if (ret < 0) {
-                                        /* no errno is provided by the
-                                         * database layer here */
-                                        op_ret   = -1;
-                                        op_errno = EINVAL;
-                                        gf_log (this->name, GF_LOG_DEBUG,
-                                                "SETDENTS %"PRId64" (%s) - %s: "
-                                                "%s (database entry creation"
-                                                " failed)",
-                                                fd->inode->ino,
-                                                bfd->ctx->directory, trav->name,
-                                                strerror (op_errno));
-                                }
-                        } else if (S_ISLNK (trav->buf.st_mode)) {
-                                /* TODO: implement */;
-                        } else {
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "SETDENTS %"PRId64" (%s) - %s mode=%o: "
-                                        "(unsupported file type)",
-                                        fd->inode->ino,
-                                        bfd->ctx->directory, trav->name,
-                                        trav->buf.st_mode);
-                        }
-                }
-        }
-
-out:
-        STACK_UNWIND (frame, op_ret, op_errno);
-
-        return 0;
-}
-
-/* bdb_fstat - stat an open bdb file.
- *
- * The attributes are taken from the storage database file of the
- * directory context attached to @fd; inode number, size and block count
- * are then overridden with per-file values.
- */
-int32_t
-bdb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
-{
-        char          *storage_db = NULL;
-        bctx_t        *dir_ctx    = NULL;
-        struct bdb_fd *file_ctx   = NULL;
-        struct stat    attrs      = {0,};
-        int32_t        op_errno   = EINVAL;
-        int32_t        op_ret     = -1;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
-        BDB_FCTX_GET (fd, this, &file_ctx);
-        if (file_ctx == NULL) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "FSTAT %"PRId64": EBADFD "
-                        "(failed to find internal context in fd)",
-                        fd->inode->ino);
-                op_ret   = -1;
-                op_errno = EBADFD;
-                goto out;
-        }
-
-        dir_ctx = file_ctx->ctx;
-        MAKE_REAL_PATH_TO_STORAGE_DB (storage_db, this, dir_ctx->directory);
-
-        op_ret   = lstat (storage_db, &attrs);
-        op_errno = errno;
-        if (op_ret != 0) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "FSTAT %"PRId64": %s"
-                        "(failed to stat database file %s)",
-                        fd->inode->ino, strerror (op_errno), storage_db);
-                goto out;
-        }
-
-        /* per-file values replace those of the shared database file */
-        attrs.st_ino    = fd->inode->ino;
-        attrs.st_size   = bdb_db_fread (file_ctx, NULL, 0, 0);
-        attrs.st_blocks = BDB_COUNT_BLOCKS (attrs.st_size, attrs.st_blksize);
-
-out:
-        STACK_UNWIND (frame, op_ret, op_errno, &attrs);
-        return 0;
-}
-
-/* gf_dirent_for_namen - build a gf_dirent_t from a name that is not
- * NUL-terminated.
- *
- * @name: first byte of the name.
- * @len:  number of bytes of @name to use.
- *
- * The bytes are copied to a NUL-terminated stack buffer, which is then
- * passed to gf_dirent_for_name(); its return value is returned as is.
- */
-gf_dirent_t *
-gf_dirent_for_namen (const char *name, size_t len)
-{
-        char *terminated = alloca (len + 1);
-
-        memcpy (terminated, name, len);
-        terminated[len] = 0;
-
-        return gf_dirent_for_name (terminated);
-}
-
-/* bdb_readdir - read directory entries.
- *
- * Entries come from two places: first the per-directory database (read
- * through a cursor, regular files), then the on-disk directory stream of
- * the fd context (bdb private files are skipped).
- *
- * @frame: call frame.
- * @this:  xlator_t of this instance of bdb xlator.
- * @fd:    directory fd.
- * @size:  byte budget for the returned entries.
- * @off:   offset to resume from; 0 starts from the beginning. An offset
- *         that is not found in the database is handed to seekdir().
- *
- * Unwinds with the number of entries returned, or with -1 and an errno
- * value on failure.
- */
-int32_t
-bdb_readdir (call_frame_t *frame,
-             xlator_t *this,
-             fd_t *fd,
-             size_t size,
-             off_t off)
-{
-        struct bdb_dir *bfd        = NULL;
-        int32_t         op_ret     = -1;
-        int32_t         op_errno   = EINVAL;
-        int32_t         db_ret     = 0;
-        size_t          filled     = 0;
-        gf_dirent_t    *this_entry = NULL;
-        gf_dirent_t     entries;
-        struct dirent  *entry      = NULL;
-        off_t           in_case    = 0;
-        int32_t         this_size  = 0;
-        DBC            *cursorp    = NULL;
-        int32_t         count      = 0;
-        off_t           offset     = 0;
-
-        /* the 'out' path hands the list to gf_dirent_free(), so it has to
-         * be initialised before the first possible 'goto out' */
-        INIT_LIST_HEAD (&entries.list);
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
-        BDB_FCTX_GET (fd, this, &bfd);
-        if (bfd == NULL) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD "
-                        "(failed to find internal context in fd)",
-                        fd->inode->ino, size, off);
-                op_errno = EBADFD;
-                op_ret   = -1;
-                goto out;
-        }
-
-        op_ret = bdb_cursor_open (bfd->ctx, &cursorp);
-        if (op_ret < 0) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD "
-                        "(failed to open cursor to database handle)",
-                        fd->inode->ino, size, off);
-                op_errno = EBADFD;
-                op_ret   = -1;
-                goto out;
-        }
-
-        if (off) {
-                DBT sec = {0,}, pri = {0,}, val = {0,};
-                sec.data = &(off);
-                sec.size = sizeof (off);
-                sec.flags = DB_DBT_USERMEM;
-                val.dlen = 0;
-                val.doff = 0;
-                val.flags = DB_DBT_PARTIAL;
-
-                db_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET);
-                if (db_ret == DB_NOTFOUND) {
-                        /* not a database offset: resume in the on-disk
-                         * directory. The cursor is no longer needed. */
-                        offset = off;
-                        bdb_cursor_close (bfd->ctx, cursorp);
-                        cursorp = NULL;
-                        goto dir_read;
-                }
-        }
-
-        db_ret = 0;
-        while (filled <= size) {
-                DBT sec = {0,}, pri = {0,}, val = {0,};
-
-                this_entry = NULL;
-
-                sec.flags = DB_DBT_MALLOC;
-                pri.flags = DB_DBT_MALLOC;
-                val.dlen = 0;
-                val.doff = 0;
-                val.flags = DB_DBT_PARTIAL;
-                db_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT);
-
-                if (db_ret == DB_NOTFOUND) {
-                        /* we reached end of the directory */
-                        db_ret = 0;
-                        break;
-                } else if (db_ret != 0) {
-                        /* any non-zero return is a failure, not only the
-                         * negative ones */
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":"
-                                "(failed to read the next entry from database)",
-                                fd->inode->ino, size, off);
-                        op_errno = ENOENT;
-                        break;
-                }
-
-                if ((pri.data == NULL) || (sec.data == NULL)) {
-                        /* NOTE: currently ignore when we get key.data == NULL.
-                         * TODO: we should not get key.data = NULL */
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":"
-                                "(null key read for entry from database)",
-                                fd->inode->ino, size, off);
-                        if (sec.data) {
-                                FREE (sec.data);
-                        }
-                        if (pri.data) {
-                                FREE (pri.data);
-                        }
-                        continue;
-                }
-
-                this_size = bdb_dirent_size (&pri);
-                if (this_size + filled > size) {
-                        /* entry does not fit: release the buffers that
-                         * libdb allocated for it */
-                        FREE (sec.data);
-                        FREE (pri.data);
-                        break;
-                }
-
-                /* TODO - consider endianness here */
-                this_entry = gf_dirent_for_namen ((const char *)pri.data,
-                                                  pri.size);
-                if (this_entry == NULL) {
-                        FREE (sec.data);
-                        FREE (pri.data);
-                        db_ret   = -1;
-                        op_errno = ENOMEM;
-                        break;
-                }
-
-                this_entry->d_ino = bdb_inode_transform (fd->inode->ino,
-                                                         pri.data,
-                                                         pri.size);
-                this_entry->d_off = *(uint32_t *)sec.data;
-                this_entry->d_type = 0;
-                this_entry->d_len = pri.size + 1;
-
-                FREE (sec.data);
-                FREE (pri.data);
-
-                list_add_tail (&this_entry->list, &entries.list);
-
-                /* only entries that were really added are counted */
-                count++;
-                filled += this_size;
-        }/* while */
-        bdb_cursor_close (bfd->ctx, cursorp);
-        cursorp = NULL;
-
-        if (db_ret != 0) {
-                op_ret = -1;
-                goto out;
-        }
-
-        op_ret   = 0;
-        op_errno = 0;
-        if (filled >= size) {
-                goto out;
-        }
-dir_read:
-        /* room left: continue with the on-disk directory */
-        if (!offset) {
-                rewinddir (bfd->dir);
-        } else {
-                seekdir (bfd->dir, offset);
-        }
-
-        while (filled <= size) {
-                this_entry = NULL;
-                entry      = NULL;
-                this_size  = 0;
-
-                in_case = telldir (bfd->dir);
-                entry = readdir (bfd->dir);
-                if (!entry)
-                        break;
-
-                if (IS_BDB_PRIVATE_FILE(entry->d_name))
-                        continue;
-
-                this_size = dirent_size (entry);
-
-                if (this_size + filled > size) {
-                        /* step back so the next call sees this entry */
-                        seekdir (bfd->dir, in_case);
-                        break;
-                }
-
-                this_entry = gf_dirent_for_name (entry->d_name);
-                if (this_entry == NULL) {
-                        op_ret   = -1;
-                        op_errno = ENOMEM;
-                        goto out;
-                }
-
-                this_entry->d_ino  = entry->d_ino;
-                this_entry->d_off  = entry->d_off;
-                this_entry->d_type = entry->d_type;
-                this_entry->d_len  = entry->d_reclen;
-
-                list_add_tail (&this_entry->list, &entries.list);
-
-                count++;
-                filled += this_size;
-        }
-        op_ret   = 0;
-        op_errno = 0;
-
-out:
-        if (this && fd) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "READDIR %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")"
-                        "/%"GF_PRI_SIZET",%"PRId64": %s",
-                        fd->inode->ino, filled, count, size, off,
-                        (op_ret < 0) ? strerror (op_errno) : "success");
-        }
-
-        STACK_UNWIND (frame, (op_ret < 0) ? -1 : count, op_errno, &entries);
-
-        gf_dirent_free (&entries);
-
-        return 0;
-}
-
-
-/* bdb_stats - fill a struct xlator_stats for the export.
- *
- * Disk figures come from statvfs() on the export path. Read and write
- * usage are the lifetime averages divided by the highest per-interval rate
- * seen so far; the interval counters are reset on every call.
- */
-int32_t
-bdb_stats (call_frame_t *frame, xlator_t *this, int32_t flags)
-
-{
-        struct xlator_stats  result      = {0, };
-        struct xlator_stats *stats       = NULL;
-        struct bdb_private  *private     = NULL;
-        struct statvfs       vfs         = {0,};
-        struct timeval       now;
-        int64_t              elapsed_ms  = 0;
-        int64_t              mean_read   = 0;
-        int64_t              mean_write  = 0;
-        int32_t              op_ret      = 0;
-        int32_t              op_errno    = 0;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-
-        stats   = &result;
-        private = (struct bdb_private *)(this->private);
-
-        op_ret = statvfs (private->export_path, &vfs);
-        if (op_ret != 0) {
-                op_errno = errno;
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "STATS %s: %s",
-                        private->export_path, strerror (op_errno));
-                goto out;
-        }
-
-        stats->nr_files   = private->stats.nr_files;
-        /* client info is maintained at FSd */
-        stats->nr_clients = private->stats.nr_clients;
-
-        /* disk figures, in bytes */
-        stats->free_disk       = vfs.f_bfree * vfs.f_bsize;
-        stats->total_disk_size = vfs.f_blocks * vfs.f_bsize;
-        stats->disk_usage      = (vfs.f_blocks - vfs.f_bavail) * vfs.f_bsize;
-
-        gettimeofday (&now, NULL);
-
-        /* averages since init */
-        elapsed_ms = (now.tv_sec - private->init_time.tv_sec) * 1000 +
-                ((now.tv_usec - private->init_time.tv_usec) / 1000);
-        if (elapsed_ms) {
-                mean_read  = private->read_value / elapsed_ms;
-                mean_write = private->write_value / elapsed_ms;
-        }
-
-        /* peak rates, updated from the interval since the previous fetch */
-        elapsed_ms = (now.tv_sec - private->prev_fetch_time.tv_sec) * 1000 +
-                ((now.tv_usec - private->prev_fetch_time.tv_usec) / 1000);
-        if (elapsed_ms) {
-                if ((private->interval_read / elapsed_ms) > private->max_read) {
-                        private->max_read =
-                                (private->interval_read / elapsed_ms);
-                }
-                if ((private->interval_write / elapsed_ms)
-                    > private->max_write) {
-                        private->max_write =
-                                private->interval_write / elapsed_ms;
-                }
-        }
-
-        stats->read_usage  = mean_read / private->max_read;
-        stats->write_usage = mean_write / private->max_write;
-
-        /* start a new interval */
-        gettimeofday (&(private->prev_fetch_time), NULL);
-        private->interval_read  = 0;
-        private->interval_write = 0;
-
-out:
-        STACK_UNWIND (frame, op_ret, op_errno, stats);
-        return 0;
-}
-
-
-int32_t
-bdb_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct flock *lock)
-{
- gf_log (this->name, GF_LOG_ERROR,
- "glusterfs internal locking request. please load "
- "'features/locks' translator to enable glusterfs "
- "support");
-
- STACK_UNWIND (frame, -1, ENOSYS);
- return 0;
-}
-
-
-int32_t
-bdb_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct flock *lock)
-{
- gf_log (this->name, GF_LOG_ERROR,
- "glusterfs internal locking request. please load "
- "'features/locks' translator to enable glusterfs "
- "support");
-
- STACK_UNWIND (frame, -1, ENOSYS);
- return 0;
-}
-
-
-int32_t
-bdb_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- gf_log (this->name, GF_LOG_ERROR,
- "glusterfs internal locking request. please load "
- "'features/locks' translator to enable glusterfs "
- "support");
-
- STACK_UNWIND (frame, -1, ENOSYS);
- return 0;
-}
-
-
-int32_t
-bdb_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- gf_log (this->name, GF_LOG_ERROR,
- "glusterfs internal locking request. please load "
- "'features/locks' translator to enable glusterfs "
- "support");
-
- STACK_UNWIND (frame, -1, ENOSYS);
- return 0;
-}
-
-/* bdb_checksum - compute name checksums for a directory.
- *
- * Two byte arrays are produced by XOR-ing names position by position:
- * dir_checksum from the on-disk directory entries (bdb private files are
- * skipped) and file_checksum from the keys of the directory database.
- * Bytes of a name beyond ZR_FILENAME_MAX do not contribute.
- *
- * @frame: call frame.
- * @this:  xlator_t of this instance of bdb xlator.
- * @loc:   loc_t of the directory.
- * @flag:  unused.
- */
-int32_t
-bdb_checksum (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              int32_t flag)
-{
-        char          *real_path = NULL;
-        DIR           *dir       = NULL;
-        struct dirent *dirent    = NULL;
-        uint8_t        file_checksum[ZR_FILENAME_MAX] = {0,};
-        uint8_t        dir_checksum[ZR_FILENAME_MAX]  = {0,};
-        int32_t        op_ret    = -1;
-        int32_t        op_errno  = EINVAL;
-        int32_t        idx       = 0, length = 0;
-        bctx_t        *bctx      = NULL;
-        DBC           *cursorp   = NULL;
-        char          *data      = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb", frame, out);
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-        GF_VALIDATE_OR_GOTO (this->name, loc, out);
-
-        MAKE_REAL_PATH (real_path, this, loc->path);
-
-        {
-                dir = opendir (real_path);
-                op_errno = errno;
-                GF_VALIDATE_OR_GOTO (this->name, dir, out);
-                while ((dirent = readdir (dir))) {
-                        if (IS_BDB_PRIVATE_FILE(dirent->d_name))
-                                continue;
-
-                        length = strlen (dirent->d_name);
-                        /* never index past the checksum array */
-                        if (length > ZR_FILENAME_MAX)
-                                length = ZR_FILENAME_MAX;
-                        for (idx = 0; idx < length; idx++)
-                                dir_checksum[idx] ^= dirent->d_name[idx];
-                } /* while((dirent...)) */
-                closedir (dir);
-        }
-
-        {
-                bctx = bctx_lookup (B_TABLE(this), (char *)loc->path);
-                if (bctx == NULL) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "CHECKSUM %"PRId64" (%s): ENOMEM"
-                                "(failed to lookup database handle)",
-                                loc->inode->ino, loc->path);
-                        op_ret   = -1;
-                        op_errno = ENOMEM;
-                        goto out;
-                }
-
-                op_ret = bdb_cursor_open (bctx, &cursorp);
-                if (op_ret < 0) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "CHECKSUM %"PRId64" (%s): EBADFD"
-                                "(failed to open cursor to database handle)",
-                                loc->inode->ino, loc->path);
-                        op_ret   = -1;
-                        op_errno = EBADFD;
-                        goto out;
-                }
-
-                for (;;) {
-                        DBT key = {0,}, value = {0,}, sec = {0,};
-
-                        key.flags = DB_DBT_MALLOC;
-                        /* only the key is needed: request zero bytes of
-                         * the value, as bdb_readdir does */
-                        value.doff  = 0;
-                        value.dlen  = 0;
-                        value.flags = DB_DBT_PARTIAL;
-                        op_ret = bdb_cursor_get (cursorp, &sec, &key,
-                                                 &value, DB_NEXT);
-
-                        if (op_ret == DB_NOTFOUND) {
-                                /* all keys visited */
-                                op_ret   = 0;
-                                op_errno = 0;
-                                break;
-                        }
-
-                        if (op_ret != 0) {
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "CHECKSUM %"PRId64" (%s)",
-                                        loc->inode->ino, loc->path);
-                                op_ret   = -1;
-                                op_errno = ENOENT; /* TODO: watch errno */
-                                break;
-                        }
-
-                        /* successfully read */
-                        data   = key.data;
-                        length = key.size;
-                        if (length > ZR_FILENAME_MAX)
-                                length = ZR_FILENAME_MAX;
-                        for (idx = 0; idx < length; idx++)
-                                file_checksum[idx] ^= data[idx];
-
-                        FREE (key.data);
-                }
-                bdb_cursor_close (bctx, cursorp);
-        }
-out:
-        if (bctx) {
-                /* NOTE: bctx_unref always returns success,
-                 * see description of bctx_unref for more details */
-                bctx_unref (bctx);
-        }
-
-        STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum);
-
-        return 0;
-}
-
-/**
- * notify - answer GF_EVENT_PARENT_UP with GF_EVENT_CHILD_UP.
- *
- * Every other event is ignored. Always returns 0.
- */
-int32_t
-notify (xlator_t *this, int32_t event, void *data, ...)
-{
-        if (event == GF_EVENT_PARENT_UP) {
-                /* Tell the parent that bdb xlator is up */
-                assert ((this->private != NULL) &&
-                        (BDB_ENV(this) != NULL));
-                default_notify (this, GF_EVENT_CHILD_UP, data);
-        }
-
-        return 0;
-}
-
-
-
-/**
- * init - set up the bdb translator instance.
- *
- * Checks that the translator is a leaf with at least one parent, requires
- * 'option directory' to name an existing directory, allocates the private
- * structure, initialises the database environment and looks up the
- * context of the root directory.
- *
- * Returns 0 on success and -1 on failure. On failure the private
- * structure is released and this->private is reset to NULL.
- */
-int32_t
-init (xlator_t *this)
-{
-        int32_t             ret       = -1;
-        struct stat         buf       = {0,};
-        struct bdb_private *_private  = NULL;
-        char               *directory = NULL;
-        bctx_t             *bctx      = NULL;
-
-        GF_VALIDATE_OR_GOTO ("bdb", this, out);
-
-        if (this->children) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "'storage/bdb' translator should be used as leaf node "
-                        "in translator tree. please remove the subvolumes"
-                        " specified and retry.");
-                goto err;
-        }
-
-        if (!this->parents) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "'storage/bdb' translator needs at least one among "
-                        "'protocol/server' or 'mount/fuse' translator as "
-                        "parent. please add 'protocol/server' or 'mount/fuse' "
-                        "as parent of 'storage/bdb' and retry. or you can also"
-                        " try specifying mount-point on command-line.");
-                goto err;
-        }
-
-        _private = CALLOC (1, sizeof (*_private));
-        if (_private == NULL) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "could not allocate memory for 'storage/bdb' "
-                        "configuration data-structure. cannot continue from "
-                        "here");
-                goto err;
-        }
-
-
-        ret = dict_get_str (this->options, "directory", &directory);
-        if (ret < 0) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "'storage/bdb' needs at least "
-                        "'option directory <path-to-export-directory>' as "
-                        "minimal configuration option. please specify an "
-                        "export directory using "
-                        "'option directory <path-to-export-directory>' and "
-                        "retry.");
-                goto err;
-        }
-
-        umask (000); /* umask `masking' is done at the client side */
-
-        /* The export directory must already exist and be a directory. */
-        ret = stat (directory, &buf);
-        if (ret < 0) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "specified export path '%s' does not exist. "
-                        "please create the export path '%s' and retry.",
-                        directory, directory);
-                goto err;
-        }
-        if (!S_ISDIR (buf.st_mode)) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "specified export path '%s' is not a directory. "
-                        "please specify a valid and existing directory as "
-                        "export directory and retry.",
-                        directory);
-                goto err;
-        }
-
-        _private->export_path = strdup (directory);
-        if (_private->export_path == NULL) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "could not allocate memory for 'storage/bdb' "
-                        "configuration data-structure. cannot continue from "
-                        "here");
-                goto err;
-        }
-
-        _private->export_path_length = strlen (_private->export_path);
-
-        {
-                /* Stats related variables */
-                gettimeofday (&_private->init_time, NULL);
-                gettimeofday (&_private->prev_fetch_time, NULL);
-                _private->max_read = 1;
-                _private->max_write = 1;
-        }
-
-        this->private = (void *)_private;
-
-        ret = bdb_db_init (this, this->options);
-        if (ret < 0){
-                gf_log (this->name, GF_LOG_ERROR,
-                        "database environment initialisation failed. "
-                        "manually run database recovery tool and "
-                        "retry to run glusterfs");
-                goto err;
-        }
-
-        bctx = bctx_lookup (_private->b_table, "/");
-        /* NOTE: we are not doing bctx_unref() for root bctx,
-         * let it remain in active list forever */
-        if (bctx == NULL) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "could not allocate memory for "
-                        "'storage/bdb' configuration data-"
-                        "structure. cannot continue from "
-                        "here");
-                goto err;
-        }
-
-        ret = 0;
-        goto out;
-
-err:
-        /* every path that lands here is a failure, whatever value an
-         * earlier successful call left in 'ret' */
-        ret = -1;
-
-        if (_private) {
-                if (_private->export_path)
-                        FREE (_private->export_path);
-
-                FREE (_private);
-        }
-        /* do not leave a pointer to the freed structure behind */
-        this->private = NULL;
-out:
-        return ret;
-}
-
-/* bctx_cleanup - close the databases of every context on @head.
- *
- * Each context is unlinked from the list and its primary and secondary
- * handles are detached while its lock is held; the handles are closed
- * after the lock is released.
- */
-void
-bctx_cleanup (struct list_head *head)
-{
-        bctx_t *ctx       = NULL;
-        bctx_t *next      = NULL;
-        DB     *primary   = NULL;
-        DB     *secondary = NULL;
-
-        list_for_each_entry_safe (ctx, next, head, list) {
-                LOCK (&ctx->lock);
-                {
-                        primary   = ctx->primary;
-                        secondary = ctx->secondary;
-
-                        ctx->primary   = NULL;
-                        ctx->secondary = NULL;
-
-                        list_del_init (&ctx->list);
-                }
-                UNLOCK (&ctx->lock);
-
-                if (primary != NULL) {
-                        primary->close (primary, 0);
-                }
-
-                if (secondary != NULL) {
-                        secondary->close (secondary, 0);
-                }
-        }
-}
-
-/* fini - tear down the bdb translator instance.
- *
- * Closes every open directory database, stops the checkpoint thread,
- * closes the database environment and releases the private structure
- * together with the export path it owns.
- */
-void
-fini (xlator_t *this)
-{
-        struct bdb_private *private = NULL;
-        int32_t             ret     = 0;
-
-        private = this->private;
-        if (private == NULL) {
-                /* nothing was set up, or init() already cleaned up */
-                return;
-        }
-
-        if (B_TABLE(this)) {
-                /* close all the dbs from lru list */
-                bctx_cleanup (&(B_TABLE(this)->b_lru));
-                bctx_cleanup (&(B_TABLE(this)->active));
-
-                if (BDB_ENV(this)) {
-                        /* clear the flag, then wait for the checkpoint
-                         * thread to finish */
-                        LOCK (&private->active_lock);
-                        {
-                                private->active = 0;
-                        }
-                        UNLOCK (&private->active_lock);
-
-                        ret = pthread_join (private->checkpoint_thread, NULL);
-                        if (ret != 0) {
-                                gf_log (this->name, GF_LOG_CRITICAL,
-                                        "could not complete checkpointing "
-                                        "database environment. this might "
-                                        "result in inconsistencies in few"
-                                        " recent data and meta-data "
-                                        "operations");
-                        }
-
-                        BDB_ENV(this)->close (BDB_ENV(this), 0);
-                } else {
-                        /* impossible to reach here */
-                }
-
-                FREE (B_TABLE(this));
-        }
-
-        /* export_path was strdup()ed in init() and is owned by private */
-        if (private->export_path)
-                FREE (private->export_path);
-
-        FREE (private);
-        this->private = NULL;
-        return;
-}
-
-struct xlator_mops mops = { /* management operations: only 'stats' is provided */
- .stats = bdb_stats,
-};
-
-struct xlator_fops fops = { /* file-operation table: maps each fop to its bdb_* handler */
- .lookup = bdb_lookup,
- .stat = bdb_stat,
- .opendir = bdb_opendir,
- .readdir = bdb_readdir,
- .readlink = bdb_readlink,
- .mknod = bdb_mknod,
- .mkdir = bdb_mkdir,
- .unlink = bdb_unlink,
- .rmdir = bdb_rmdir,
- .symlink = bdb_symlink,
- .rename = bdb_rename,
- .link = bdb_link,
- .chmod = bdb_chmod,
- .chown = bdb_chown,
- .truncate = bdb_truncate,
- .utimens = bdb_utimens,
- .create = bdb_create,
- .open = bdb_open,
- .readv = bdb_readv,
- .writev = bdb_writev,
- .statfs = bdb_statfs,
- .flush = bdb_flush,
- .fsync = bdb_fsync,
- .setxattr = bdb_setxattr, /* directories only */
- .getxattr = bdb_getxattr, /* directories only */
- .removexattr = bdb_removexattr, /* directories only */
- .fsyncdir = bdb_fsyncdir,
- .access = bdb_access,
- .ftruncate = bdb_ftruncate, /* stub: not implemented, unwinds with EPERM */
- .fstat = bdb_fstat,
- .lk = bdb_lk,
- .inodelk = bdb_inodelk, /* stub: unwinds with ENOSYS */
- .finodelk = bdb_finodelk, /* stub: unwinds with ENOSYS */
- .entrylk = bdb_entrylk, /* stub: unwinds with ENOSYS */
- .fentrylk = bdb_fentrylk, /* stub: unwinds with ENOSYS */
- .fchown = bdb_fchown, /* stub: not implemented, unwinds with EPERM */
- .fchmod = bdb_fchmod, /* stub: not implemented, unwinds with EPERM */
- .setdents = bdb_setdents,
- .getdents = bdb_getdents,
- .checksum = bdb_checksum,
-};
-
-struct xlator_cbks cbks = { /* callbacks for release of file and directory fds */
- .release = bdb_release,
- .releasedir = bdb_releasedir
-};
-
-
-/* Volume options understood by 'storage/bdb'. The table ends with an
- * entry whose key is NULL. */
-struct volume_options options[] = {
-        { .key  = { "directory" },
-          .type = GF_OPTION_TYPE_PATH,
-          .description = "export directory"
-        },
-        { .key  = { "logdir" },
-          .type = GF_OPTION_TYPE_PATH,
-          /* the first fragment needs its trailing space, otherwise the
-           * text reads "writingtransaction" */
-          .description = "directory to be used by libdb for writing "
-          "transaction logs. NOTE: in absence of 'logdir' "
-          "export directory itself will be used as 'logdir' also"
-        },
-        { .key  = { "errfile" },
-          .type = GF_OPTION_TYPE_PATH,
-          .description = "path to be used for libdb error logging. "
-          "NOTE: absence of 'errfile' will disable any "
-          "error logging by libdb."
-        },
-        { .key  = { "dir-mode" },
-          .type = GF_OPTION_TYPE_ANY /* base 8 number */
-        },
-        { .key  = { "file-mode" },
-          .type = GF_OPTION_TYPE_ANY,
-          .description = "file mode for regular files. stat() on a regular file"
-          " returns the mode specified by this option. "
-          "NOTE: specify value in octal"
-        },
-        { .key  = { "page-size" },
-          .type = GF_OPTION_TYPE_SIZET,
-          .min  = 512,
-          .max  = 16384,
-          .description = "size of pages used to hold data by libdb. set it to "
-          "block size of exported filesystem for "
-          "optimal performance"
-        },
-        { .key  = { "open-db-lru-limit" },
-          .type = GF_OPTION_TYPE_INT,
-          .min  = 1,
-          .max  = 2048,
-          .description = "maximum number of per directory databases that can "
-          "be kept open. NOTE: for _advanced_ users only."
-        },
-        { .key  = { "lock-timeout" },
-          .type = GF_OPTION_TYPE_TIME,
-          .min  = 0,
-          .max  = 4260000,
-          .description = "define the maximum time a lock request can "
-          "be blocked by libdb. NOTE: only for _advanced_ users."
-          " do not specify this option when not sure."
-        },
-        { .key  = { "checkpoint-interval" },
-          .type = GF_OPTION_TYPE_TIME,
-          .min  = 1,
-          .max  = 86400,
-          .description = "define the time interval between two consecutive "
-          "libdb checpoints. setting to lower value will leave "
-          "bdb perform slowly, but guarantees that minimum data"
-          " will be lost in case of a crash. NOTE: this option "
-          "is valid only when "
-          "'option mode=\"persistent\"' is set."
-        },
-        { .key  = { "transaction-timeout" },
-          .type = GF_OPTION_TYPE_TIME,
-          .min  = 0,
-          .max  = 4260000,
-          .description = "maximum time for which a transaction can block "
-          "waiting for required resources."
-        },
-        { .key  = { "mode" },
-          /* takes one of two words, not a boolean: declare it like
-           * 'access-mode' below so "cache"/"persistent" are accepted */
-          .type = GF_OPTION_TYPE_STR,
-          .value = { "cache", "persistent" },
-          .description = "cache: data recovery is not guaranteed in case "
-          "of crash. persistent: data recovery is guaranteed, "
-          "since all operations are transaction protected."
-        },
-        { .key  = { "access-mode" },
-          .type = GF_OPTION_TYPE_STR,
-          .value = {"btree", "hash" },
-          .description = "chose the db access method. "
-          "NOTE: for _advanced_ users. leave the choice to "
-          "glusterfs when in doubt."
-        },
-        { .key = { NULL } }
-};
diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h
deleted file mode 100644
index ffdadd1cc..000000000
--- a/xlators/storage/bdb/src/bdb.h
+++ /dev/null
@@ -1,530 +0,0 @@
-/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _BDB_H
-#define _BDB_H
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdio.h>
-#include <dirent.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <dirent.h>
-
-#include <db.h>
-
-#ifdef linux
-#ifdef __GLIBC__
-#include <sys/fsuid.h>
-#else
-#include <unistd.h>
-#endif
-#endif
-
-#ifdef HAVE_SYS_XATTR_H
-#include <sys/xattr.h>
-#endif
-
-#ifdef HAVE_SYS_EXTATTR_H
-#include <sys/extattr.h>
-#endif
-
-#include <pthread.h>
-#include "xlator.h"
-#include "inode.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "fd.h"
-#include "syscall.h"
-
-#define BDB_STORAGE "/glusterfs_storage.db"
-
-/* numbers are not so reader-friendly, so lets have ON and OFF macros */
-#define ON 1
-#define OFF 0
-
-#define BDB_DEFAULT_LRU_LIMIT 100
-#define BDB_DEFAULT_HASH_SIZE 100
-
-#define BDB_ENOSPC_THRESHOLD 25600
-
-#define BDB_DEFAULT_CHECKPOINT_INTERVAL 30
-
-#define BCTX_ENV(bctx) (bctx->table->dbenv)
-
-#define BDB_EXPORT_PATH_LEN(_private) \
- (((struct bdb_private *)_private)->export_path_length)
-
-#define BDB_KEY_FROM_FREQUEST_KEY(_key) (&(key[15]))
-
-#define BDB_EXPORT_PATH(_private) \
- (((struct bdb_private *)_private)->export_path)
-/* MAKE_REAL_PATH(var,this,path)
- * make the real path on the underlying file-system
- *
- * @var: destination to hold the real path
- * @this: pointer to xlator_t corresponding to bdb xlator
- * @path: path, as seen from mount-point
- */
-#define MAKE_REAL_PATH(var, this, path) do { \
- int base_len = BDB_EXPORT_PATH_LEN(this->private); \
- var = alloca (strlen (path) + base_len + 2); \
- strcpy (var, BDB_EXPORT_PATH(this->private)); \
- strcpy (&var[base_len], path); \
- } while (0)
-
-
-#define BDB_TIMED_LOG(_errno,_counter) \
- ((_errno == ENOTSUP) && (((++_counter) % GF_UNIVERSAL_ANSWER) == 1))
-
-#define GF_FILE_CONTENT_REQUEST ZR_FILE_CONTENT_REQUEST
-
-/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path)
- * make the real path to the storage-database file on file-system
- *
- * @var: destination to hold the real path
- * @this: pointer to xlator_t corresponding to bdb xlator
- * @path: path of the directory, as seen from mount-point
- */
-#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do { \
- int base_len = BDB_EXPORT_PATH_LEN(this->private); \
- var = alloca (strlen (path) + \
- base_len + \
- strlen (BDB_STORAGE)); \
- strcpy (var, BDB_EXPORT_PATH(this->private)); \
- strcpy (&var[base_len], path); \
- strcat (var, BDB_STORAGE); \
- } while (0)
-
-/* MAKE_KEY_FROM_PATH(key,path)
- * make a 'key', which we use as key in the underlying database by using
- * the path
- *
- * @key: destination to hold the key
- * @path: path to file as seen from mount-point
- */
-#define MAKE_KEY_FROM_PATH(key, path) do { \
- char *tmp = alloca (strlen (path)); \
- strcpy (tmp, path); \
- key = basename (tmp); \
- }while (0);
-
-/* IS_BDB_PRIVATE_FILE(name)
- * check if a given 'name' is bdb xlator's internal file name
- *
- * @name: basename of a file.
- *
- * bdb xlator reserves file names 'glusterfs_storage.db',
- * 'glusterfs_ns.db'(used by bdb xlator itself), 'log.*', '__db.*'
- * (used by libdb)
- */
-#define IS_BDB_PRIVATE_FILE(name) ((!strncmp(name, "__db.", 5)) || \
- (!strcmp(name, "glusterfs_storage.db")) || \
- (!strcmp(name, "glusterfs_ns.db")) || \
- (!strncmp(name, "log.0000", 8)))
-
-/* check if 'name' is '.' or '..' entry */
-#define IS_DOT_DOTDOT(name) \
- ((!strncmp(name,".", 1)) || (!strncmp(name,"..", 2)))
-
-/* BDB_ICTX_SET(this,inode,bctx)
- * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories.
- * this will happen either in lookup() or mkdir().
- *
- * @this: pointer xlator_t of bdb xlator.
- * @inode: inode where 'struct bdb_ctx *' has to be stored.
- * @bctx: a 'struct bdb_ctx *'
- */
-#define BDB_ICTX_SET(_inode,_this,_bctx) do{ \
- inode_ctx_put(_inode, _this, (uint64_t)(long)_bctx); \
- }while (0);
-
-#define BDB_ICTX_GET(_inode,_this,_bctxp) do { \
- uint64_t tmp_bctx = 0; \
- inode_ctx_get (_inode, _this, &tmp_bctx); \
- *_bctxp = tmp_bctx; \
- }while (0);
-
-/* BDB_FCTX_SET(this,fd,bctx)
- * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories.
- * this will happen either in lookup() or mkdir().
- *
- * @this: pointer xlator_t of bdb xlator.
- * @inode: inode where 'struct bdb_ctx *' has to be stored.
- * @bctx: a 'struct bdb_ctx *'
- */
-#define BDB_FCTX_SET(_fd,_this,_bfd) do{ \
- fd_ctx_set(_fd, _this, (uint64_t)(long)_bfd); \
- }while (0);
-
-#define BDB_FCTX_GET(_fd,_this,_bfdp) do { \
- uint64_t tmp_bfd = 0; \
- fd_ctx_get (_fd, _this, &tmp_bfd); \
- *_bfdp = (void *)(long)tmp_bfd; \
- }while (0);
-
-
-/* maximum number of open dbs that bdb xlator will ever have */
-#define BDB_MAX_OPEN_DBS 100
-
-/* convert file size to block-count */
-#define BDB_COUNT_BLOCKS(size,blksize) (((size + blksize - 1)/blksize) - 1)
-
-/* file permissions, again macros are more readable */
-#define RWXRWXRWX 0777
-#define DEFAULT_FILE_MODE 0644
-#define DEFAULT_DIR_MODE 0755
-
-/* see, if have a valid file permissions specification in @mode */
-#define IS_VALID_FILE_MODE(mode) (!(mode & (~RWXRWXRWX)))
-#define IS_VALID_DIR_MODE(mode) (!(mode & (~(RWXRWXRWX)))
-
-/* maximum retries for a failed transactional operation */
-#define BDB_MAX_RETRIES 10
-
-#define BDB_LL_PAGE_SIZE_DEFAULT 4096
-#define BDB_LL_PAGE_SIZE_MIN 4096
-#define BDB_LL_PAGE_SIZE_MAX 65536
-
-#define PAGE_SIZE_IN_RANGE(_page_size) \
- ((_page_size >= BDB_LL_PAGE_SIZE_MIN) \
- && (table->page_size <= BDB_LL_PAGE_SIZE_MAX))
-
-typedef struct bctx_table bctx_table_t;
-typedef struct bdb_ctx bctx_t;
-typedef struct bdb_cache bdb_cache_t;
-typedef struct bdb_private bdb_private_t;
-
-struct bctx_table {
- /* flags to be used for opening each database */
- uint64_t dbflags;
-
- /* cache: can be either ON or OFF */
- uint64_t cache;
-
- /* used to lock the 'struct bctx_table *' */
- gf_lock_t lock;
-
- /* lock for checkpointing */
- gf_lock_t checkpoint_lock;
-
- /* hash table of 'struct bdb_ctx' */
- struct list_head *b_hash;
-
- /* list of active 'struct bdb_ctx' */
- struct list_head active;
-
- /* lru list of inactive 'struct bdb_ctx' */
- struct list_head b_lru;
- struct list_head purge;
- uint32_t lru_limit;
- uint32_t lru_size;
- uint32_t hash_size;
-
- /* access mode for accessing the databases, can be DB_HASH, DB_BTREE */
- DBTYPE access_mode;
-
- /* DB_ENV under which every db operation is carried over */
- DB_ENV *dbenv;
- int32_t transaction;
- xlator_t *this;
-
- /* page-size of DB, DB->set_pagesize(), should be set before DB->open */
- uint64_t page_size;
-};
-
-struct bdb_ctx {
- /* controller members */
-
- /* lru list of 'struct bdb_ctx's, a bdb_ctx can exist in one of
- * b_hash or lru lists */
- struct list_head list;
-
- /* directory 'name' hashed list of 'struct bdb_ctx's */
- struct list_head b_hash;
-
- struct bctx_table *table;
- int32_t ref; /* reference count */
- gf_lock_t lock; /* used to lock this 'struct bdb_ctx' */
-
- char *directory; /* directory path */
-
- /* pointer to open database, that resides inside this directory */
- DB *primary;
- DB *secondary;
- uint32_t cache; /* cache ON or OFF */
-
- /* per directory cache, bdb xlator's internal cache */
- struct list_head c_list; /* linked list of cached records */
- int32_t c_count; /* number of cached records */
-
- /* index to hash table list, to which this ctx belongs */
- int32_t key_hash;
- char *db_path; /* absolute path to db file */
-};
-
-struct bdb_fd {
- /* pointer to bdb_ctx of the parent directory */
- struct bdb_ctx *ctx;
-
- /* name of the file. NOTE: basename, not the complete path */
- char *key;
- int32_t flags; /* open flags */
-};
-
-struct bdb_dir {
- /* pointer to bdb_ctx of this directory */
- struct bdb_ctx *ctx;
-
- /* open directory pointer, as returned by opendir() */
- DIR *dir;
-
- char *path; /* path to this directory */
-};
-
-/* cache */
-struct bdb_cache {
- /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */
- struct list_head c_list;
-
- /* name of the file this cache holds. NOTE: basename of file */
- char *key;
- char *data; /* file content */
-
- /* size of the file content that this cache holds */
- size_t size;
-};
-
-
-struct bdb_private {
- /* pointer to inode table that we use */
- inode_table_t *itable;
- int32_t temp; /**/
- char is_stateless; /**/
-
- /* path to the export directory
- * (option directory <export-path>) */
- char *export_path;
-
- /* length of 'export_path' string */
- int32_t export_path_length;
-
- /* statistics */
- /* Statistics, provides activity of the server */
- struct xlator_stats stats;
-
- struct timeval prev_fetch_time;
- struct timeval init_time;
- int32_t max_read; /* */
- int32_t max_write; /* */
-
- /* Used to calculate the max_read value */
- int64_t interval_read;
-
- /* Used to calculate the max_write value */
- int64_t interval_write;
- int64_t read_value; /* Total read, from init */
- int64_t write_value; /* Total write, from init */
-
- /* bdb xlator specific private data */
-
- /* flags used for opening DB_ENV for this xlator */
- uint64_t envflags;
-
- /* flags to be used for opening each database */
- uint64_t dbflags;
-
- /* cache: can be either ON or OFF */
- uint64_t cache;
-
- /* transaction: can be either ON or OFF */
- uint32_t transaction;
- uint32_t active;
- gf_lock_t active_lock;
- struct bctx_table *b_table;
-
- /* access mode for accessing the databases, can be DB_HASH, DB_BTREE
- * (option access-mode <mode>) */
- DBTYPE access_mode;
-
- /* mode for each and every file stored on bdb
- * (option file-mode <mode>) */
- mode_t file_mode;
-
- /* mode for each and every directory stored on bdb
- * (option dir-mode <mode>) */
- mode_t dir_mode;
-
- /* mode for each and every symlink stored on bdb */
- mode_t symlink_mode;
-
- /* pthread_t object used for creating checkpoint thread */
- pthread_t checkpoint_thread;
-
- /* time duration between two consecutive checkpoint operations.
- * (option checkpoint-interval <time-in-seconds>) */
- uint32_t checkpoint_interval;
-
- /* environment log directory (option logdir <directory>) */
- char *logdir;
-
- /* errfile path, used by environment to print detailed error log.
- * (option errfile <errfile-path>) */
- char *errfile;
-
- /* DB_ENV->set_errfile() expects us to fopen
- * the errfile before doing DB_ENV->set_errfile() */
- FILE *errfp;
-
- /* used by DB_ENV->set_timeout to set the timeout for
- * a transactionally encapsulated DB->operation() to
- * timeout before waiting for locks to be released.
- * (option transaction-timeout <time-in-milliseconds>)
- */
- uint32_t txn_timeout;
- uint32_t lock_timeout;
-
- /* DB_AUTO_LOG_REMOVE flag for DB_ENV*/
- uint32_t log_auto_remove;
- uint32_t log_region_max;
-};
-
-
-static inline int32_t
-bdb_txn_begin (DB_ENV *dbenv,
- DB_TXN **ptxnid)
-{
- return dbenv->txn_begin (dbenv, NULL, ptxnid, 0);
-}
-
-static inline int32_t
-bdb_txn_abort (DB_TXN *txnid)
-{
- return txnid->abort (txnid);
-}
-
-static inline int32_t
-bdb_txn_commit (DB_TXN *txnid)
-{
- return txnid->commit (txnid, 0);
-}
-
-void *
-bdb_db_stat (bctx_t *bctx,
- DB_TXN *txnid,
- uint32_t flags);
-
-/*int32_t
-bdb_db_get(struct bdb_ctx *bctx,
- DB_TXN *txnid,
- const char *key_string,
- char **buf,
- size_t size,
- off_t offset);
-*/
-int32_t
-bdb_db_fread (struct bdb_fd *bfd, char *bufp, size_t size, off_t offset);
-
-int32_t
-bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp);
-
-#define BDB_TRUNCATE_RECORD 0xcafebabe
-
-/*int32_t
-bdb_db_put (struct bdb_ctx *bctx,
- DB_TXN *txnid,
- const char *key_string,
- const char *buf,
- size_t size,
- off_t offset,
- int32_t flags);
-*/
-int32_t
-bdb_db_icreate (struct bdb_ctx *bctx, const char *key);
-
-int32_t
-bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset);
-
-int32_t
-bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size);
-
-int32_t
-bdb_db_itruncate (struct bdb_ctx *bctx, const char *key);
-
-int32_t
-bdb_db_iremove (struct bdb_ctx *bctx,
- const char *key);
-
-ino_t
-bdb_inode_transform (ino_t parent,
- const char *name,
- size_t namelen);
-
-int32_t
-bdb_cursor_open (struct bdb_ctx *bctx,
- DBC **cursorp);
-
-int32_t
-bdb_cursor_get (DBC *cursorp,
- DBT *sec, DBT *pri,
- DBT *value,
- int32_t flags);
-
-
-int32_t
-bdb_cursor_close (struct bdb_ctx *ctx,
- DBC *cursorp);
-
-
-int32_t
-bdb_dirent_size (DBT *key);
-
-int32_t
-dirent_size (struct dirent *entry);
-
-int
-bdb_db_init (xlator_t *this,
- dict_t *options);
-
-void
-bdb_dbs_from_dict_close (dict_t *this,
- char *key,
- data_t *value,
- void *data);
-
-bctx_t *
-bctx_lookup (struct bctx_table *table,
- const char *path);
-
-bctx_t *
-bctx_parent
-(struct bctx_table *table,
- const char *path);
-
-bctx_t *
-bctx_unref (bctx_t *ctx);
-
-bctx_t *
-bctx_ref (bctx_t *ctx);
-
-#endif /* _BDB_H */
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am
index 9acaad651..88efcc784 100644
--- a/xlators/storage/posix/src/Makefile.am
+++ b/xlators/storage/posix/src/Makefile.am
@@ -2,16 +2,18 @@
xlator_LTLIBRARIES = posix.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
-posix_la_LDFLAGS = -module -avoidversion
+posix_la_LDFLAGS = -module -avoid-version
-posix_la_SOURCES = posix.c
-posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c
+posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO)
-noinst_HEADERS = posix.h
+noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h
-AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
- $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
-CLEANFILES =
+AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
new file mode 100644
index 000000000..c3bbddd67
--- /dev/null
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -0,0 +1,569 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+#include "posix.h"
+#include <sys/uio.h>
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+
+
+/*
+ * Bring the O_DIRECT state of the backend descriptor (pfd->fd) in line
+ * with the current request.
+ *
+ * O_DIRECT is wanted unconditionally when either the fd's open flags or
+ * the per-operation flags (opflags) carry it. Otherwise it is used
+ * opportunistically: only when neither offset nor size has any of the low
+ * 12 bits set (i.e. both are multiples of 4096).
+ *
+ * posix_aio_readv() and posix_aio_writev() call this with fd->lock held.
+ *
+ * pfd->odirect caches the descriptor's current state. It is updated only
+ * after fcntl() succeeded; on failure a warning is logged and the cached
+ * value is left describing the descriptor as it really is.
+ */
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+                        off_t offset, size_t size)
+{
+        int odirect = 0;
+        int flags   = 0;
+        int ret     = 0;
+
+        if ((fd->flags|opflags) & O_DIRECT) {
+                /* if instructed, use O_DIRECT always */
+                odirect = 1;
+        } else {
+                /* else use O_DIRECT when feasible */
+                odirect = ((offset|size) & 0xfff) ? 0 : 1;
+        }
+
+        /* descriptor is already in the wanted state: nothing to change */
+        if (!odirect == !pfd->odirect)
+                return;
+
+        flags = fcntl (pfd->fd, F_GETFL);
+        if (flags == -1) {
+                /* never feed the -1 error value back into F_SETFL */
+                ret = -1;
+        } else if (odirect) {
+                ret = fcntl (pfd->fd, F_SETFL, (flags | O_DIRECT));
+        } else {
+                ret = fcntl (pfd->fd, F_SETFL, (flags & (~O_DIRECT)));
+        }
+
+        if (ret) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d",
+                        strerror (errno), pfd->fd, flags, pfd->odirect);
+                return;
+        }
+
+        pfd->odirect = odirect;
+}
+
+
+/*
+ * Per-request state for one asynchronous read or write. It is allocated by
+ * posix_aio_readv()/posix_aio_writev(), reachable from the completion event
+ * through iocb.data, and freed by the matching *_complete() handler.
+ */
+struct posix_aio_cb {
+ struct iocb iocb; /* control block passed to io_submit(); .data points back here */
+ call_frame_t *frame; /* frame to unwind once the request completes */
+ struct iobuf *iobuf; /* destination buffer of a read (set by posix_aio_readv only) */
+ struct iobref *iobref; /* reference on the write payload (set by posix_aio_writev only) */
+ struct iatt prebuf; /* posix_fdstat() result taken before a write is submitted */
+ int fd; /* backend file descriptor (pfd->fd) */
+ int op; /* GF_FOP_READ or GF_FOP_WRITE; selects the completion handler */
+ off_t offset; /* file offset of the request */
+};
+
+
+/*
+ * Completion handler for an asynchronous read, called from
+ * posix_aio_thread().
+ *
+ * @paiocb: request state created by posix_aio_readv(); freed here.
+ * @res:    result of the read: byte count, or a negated errno when < 0.
+ * @res2:   second result field of the io_event; not used.
+ *
+ * Unwinds the readv frame exactly once (success or failure), drops the
+ * reference on the read buffer and releases @paiocb. Always returns 0.
+ */
+int
+posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t         *frame    = NULL;
+        xlator_t             *this     = NULL;
+        struct iobuf         *iobuf    = NULL;
+        struct iatt           postbuf  = {0,};
+        int                   _fd      = -1;
+        int                   op_ret   = -1;
+        int                   op_errno = 0;
+        /* zeroed so the error paths never hand an indeterminate iovec to
+         * the unwind below */
+        struct iovec          iov      = {0,};
+        struct iobref        *iobref   = NULL;
+        int                   ret      = 0;
+        off_t                 offset   = 0;
+        struct posix_private *priv     = NULL;
+
+
+        frame = paiocb->frame;
+        this = frame->this;
+        priv = this->private;
+        iobuf = paiocb->iobuf;
+        _fd = paiocb->fd;
+        offset = paiocb->offset;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)",
+                        _fd, paiocb->iocb.u.c.nbytes,
+                        (unsigned long long) paiocb->offset,
+                        res, strerror (op_errno));
+                goto out;
+        }
+
+        ret = posix_fdstat (this, _fd, &postbuf);
+        if (ret != 0) {
+                op_ret = -1;
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fstat failed on fd=%d: %s", _fd,
+                        strerror (op_errno));
+                goto out;
+        }
+
+        op_ret = res;
+        op_errno = 0;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref_add (iobref, iobuf);
+
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len = op_ret;
+
+
+        /* Hack to notify higher layers of EOF. */
+        if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+                op_errno = ENOENT;
+
+        /* account the bytes read in the translator-wide counter */
+        LOCK (&priv->lock);
+        {
+                priv->read_value += op_ret;
+        }
+        UNLOCK (&priv->lock);
+
+out:
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
+                             &postbuf, iobref, NULL);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        if (iobref)
+                iobref_unref (iobref);
+
+        GF_FREE (paiocb);
+
+        return 0;
+}
+
+
+/*
+ * readv fop implemented on top of Linux AIO.
+ *
+ * Allocates the read buffer and a posix_aio_cb, fills in a PREAD control
+ * block and submits it. On successful submission the frame is unwound
+ * later by posix_aio_readv_complete(); on any failure it is unwound here
+ * with op_ret -1 and whatever was allocated is released. Always returns 0.
+ */
+int
+posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        struct posix_private *priv     = NULL;
+        struct posix_fd      *pfd      = NULL;
+        struct posix_aio_cb  *paiocb   = NULL;
+        struct iobuf         *iobuf    = NULL;
+        struct iocb          *iocb     = NULL;
+        int32_t               op_errno = EINVAL;
+        int                   sysfd    = -1;
+        int                   ret      = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = posix_fd_ctx_get (fd, this, &pfd);
+        if (ret < 0) {
+                op_errno = -ret;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pfd is NULL from fd=%p", fd);
+                goto err;
+        }
+        sysfd = pfd->fd;
+
+        /* a zero-length read is rejected outright */
+        if (size == 0) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto err;
+        }
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (iobuf == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb);
+        if (paiocb == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* state the completion handler needs */
+        paiocb->frame  = frame;
+        paiocb->iobuf  = iobuf;
+        paiocb->offset = offset;
+        paiocb->fd     = sysfd;
+        paiocb->op     = GF_FOP_READ;
+
+        /* the control block itself */
+        iocb = &paiocb->iocb;
+        iocb->data           = paiocb;
+        iocb->aio_fildes     = sysfd;
+        iocb->aio_lio_opcode = IO_CMD_PREAD;
+        iocb->aio_reqprio    = 0;
+        iocb->u.c.buf        = iobuf_ptr (iobuf);
+        iocb->u.c.nbytes     = size;
+        iocb->u.c.offset     = offset;
+
+        /* the O_DIRECT adjustment and the submission happen under the
+         * same fd->lock */
+        LOCK (&fd->lock);
+        {
+                __posix_fd_set_odirect (fd, pfd, flags, offset, size);
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                op_errno = -ret;
+                goto err;
+        }
+
+        return 0;
+
+err:
+        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        if (paiocb)
+                GF_FREE (paiocb);
+
+        return 0;
+}
+
+
+/*
+ * Completion handler for an asynchronous write, called from
+ * posix_aio_thread().
+ *
+ * @paiocb: request state created by posix_aio_writev(); freed here.
+ * @res:    result of the write: byte count, or a negated errno when < 0.
+ * @res2:   second result field of the io_event; not used.
+ *
+ * Unwinds the writev frame with the pre-write stat captured at submission
+ * and a fresh post-write stat, then drops the payload reference and
+ * releases @paiocb. Always returns 0.
+ */
+int
+posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t         *frame    = paiocb->frame;
+        xlator_t             *this     = frame->this;
+        struct posix_private *priv     = this->private;
+        struct iatt           prebuf   = paiocb->prebuf;
+        struct iatt           postbuf  = {0,};
+        int                   _fd      = paiocb->fd;
+        int                   op_ret   = -1;
+        int                   op_errno = 0;
+
+        if (res < 0) {
+                /* the write itself failed */
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async) failed fd=%d,offset=%llu (%d/%s)",
+                        _fd, (unsigned long long) paiocb->offset, res,
+                        strerror (op_errno));
+        } else if (posix_fdstat (this, _fd, &postbuf) != 0) {
+                /* written, but the post-write stat could not be taken */
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fstat failed on fd=%d: %s", _fd,
+                        strerror (op_errno));
+        } else {
+                op_ret = res;
+                op_errno = 0;
+
+                /* account the bytes written in the translator-wide counter */
+                LOCK (&priv->lock);
+                {
+                        priv->write_value += op_ret;
+                }
+                UNLOCK (&priv->lock);
+        }
+
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+                             NULL);
+
+        if (paiocb->iobref)
+                iobref_unref (paiocb->iobref);
+        GF_FREE (paiocb);
+
+        return 0;
+}
+
+
+/*
+ * writev fop implemented on top of Linux AIO.
+ *
+ * Allocates a posix_aio_cb, takes a reference on the payload, records the
+ * pre-write stat, fills in a PWRITEV control block and submits it. On
+ * successful submission the frame is unwound later by
+ * posix_aio_writev_complete(); on any failure it is unwound here with
+ * op_ret -1 and the request state is released. Always returns 0.
+ */
+int
+posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iovec *iov, int count, off_t offset, uint32_t flags,
+                  struct iobref *iobref, dict_t *xdata)
+{
+        struct posix_private *priv     = NULL;
+        struct posix_fd      *pfd      = NULL;
+        struct posix_aio_cb  *paiocb   = NULL;
+        struct iocb          *iocb     = NULL;
+        int32_t               op_errno = EINVAL;
+        int                   sysfd    = -1;
+        int                   ret      = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = posix_fd_ctx_get (fd, this, &pfd);
+        if (ret < 0) {
+                op_errno = -ret;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pfd is NULL from fd=%p", fd);
+                goto err;
+        }
+        sysfd = pfd->fd;
+
+        paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb);
+        if (paiocb == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* state the completion handler needs; the payload reference is
+         * dropped again on the err path or at completion */
+        paiocb->frame  = frame;
+        paiocb->offset = offset;
+        paiocb->fd     = sysfd;
+        paiocb->op     = GF_FOP_WRITE;
+        paiocb->iobref = iobref_ref (iobref);
+
+        /* the control block itself */
+        iocb = &paiocb->iocb;
+        iocb->data           = paiocb;
+        iocb->aio_fildes     = sysfd;
+        iocb->aio_lio_opcode = IO_CMD_PWRITEV;
+        iocb->aio_reqprio    = 0;
+        iocb->u.v.vec        = iov;
+        iocb->u.v.nr         = count;
+        iocb->u.v.offset     = offset;
+
+        /* pre-write attributes, reported to the caller on completion */
+        ret = posix_fdstat (this, sysfd, &paiocb->prebuf);
+        if (ret != 0) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fstat failed on fd=%p: %s", fd,
+                        strerror (op_errno));
+                goto err;
+        }
+
+        /* the O_DIRECT adjustment and the submission happen under the
+         * same fd->lock */
+        LOCK (&fd->lock);
+        {
+                __posix_fd_set_odirect (fd, pfd, flags, offset,
+                                        iov_length (iov, count));
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                op_errno = -ret;
+                goto err;
+        }
+
+        return 0;
+
+err:
+        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);
+
+        if (paiocb) {
+                if (paiocb->iobref)
+                        iobref_unref (paiocb->iobref);
+                GF_FREE (paiocb);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Body of the AIO reaper thread started by posix_aio_init().
+ *
+ * @data: the posix xlator_t; also installed as THIS for this thread.
+ *
+ * Blocks in io_getevents() for at least one and at most
+ * POSIX_AIO_MAX_NR_GETEVENTS completions, then dispatches each one to the
+ * read or write completion handler according to the op recorded in its
+ * posix_aio_cb. An interrupted wait (-EINTR) is simply retried; any other
+ * non-positive return is logged and ends the thread. Returns NULL.
+ */
+void *
+posix_aio_thread (void *data)
+{
+        xlator_t             *this   = NULL;
+        struct posix_private *priv   = NULL;
+        int                   ret    = 0;
+        int                   i      = 0;
+        struct io_event       events[POSIX_AIO_MAX_NR_GETEVENTS];
+        struct io_event      *event  = NULL;
+        struct posix_aio_cb  *paiocb = NULL;
+
+        this = data;
+        THIS = this;
+        priv = this->private;
+
+        for (;;) {
+                memset (&events[0], 0, sizeof (events));
+                ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS,
+                                    &events[0], NULL);
+
+                /* an interrupted wait is not a failure: retry without
+                 * logging it at ERROR level */
+                if (ret == -EINTR)
+                        continue;
+
+                if (ret <= 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "io_getevents() returned %d", ret);
+                        break;
+                }
+
+                for (i = 0; i < ret; i++) {
+                        event = &events[i];
+
+                        paiocb = event->data;
+
+                        switch (paiocb->op) {
+                        case GF_FOP_READ:
+                                posix_aio_readv_complete (paiocb, event->res,
+                                                          event->res2);
+                                break;
+                        case GF_FOP_WRITE:
+                                posix_aio_writev_complete (paiocb, event->res,
+                                                           event->res2);
+                                break;
+                        default:
+                                /* NOTE(review): this branch only logs; the
+                                 * request is neither unwound nor freed */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "unknown op %d found in piocb",
+                                        paiocb->op);
+                                break;
+                        }
+                }
+        }
+
+        return NULL;
+}
+
+
+/*
+ * One-time AIO bring-up: create the kernel io context and start the reaper
+ * thread (posix_aio_thread).
+ *
+ * Returns 0 when AIO is ready, and also when the kernel reports ENOSYS
+ * (AIO unavailable at run-time, caller continues with synchronous IO);
+ * returns the failing call's non-zero result otherwise.
+ *
+ * priv->aio_capable is set to true, and the AIO fops are installed, only
+ * when both the io context and the thread actually exist. The return code
+ * alone cannot express that, because the ENOSYS case deliberately returns
+ * 0 as well.
+ */
+int
+posix_aio_init (xlator_t *this)
+{
+        struct posix_private *priv = NULL;
+        int                   ret  = 0;
+
+        priv = this->private;
+
+        ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp);
+        if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Linux AIO not available at run-time."
+                        " Continuing with synchronous IO");
+                ret = 0;
+                goto out;
+        }
+
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "io_setup() failed. ret=%d, errno=%d",
+                        ret, errno);
+                goto out;
+        }
+
+        ret = gf_thread_create (&priv->aiothread, NULL,
+                                posix_aio_thread, this);
+        if (ret != 0) {
+                /* no reaper thread: give the io context back */
+                io_destroy (priv->ctxp);
+                goto out;
+        }
+
+        priv->aio_capable = _gf_true;
+
+        this->fops->readv  = posix_aio_readv;
+        this->fops->writev = posix_aio_writev;
+out:
+        return ret;
+}
+
+
+/*
+ * Switch the translator to the AIO read/write paths.
+ *
+ * The first call performs posix_aio_init() and remembers that it was
+ * attempted (priv->aio_init_done). The AIO fops are installed whenever
+ * priv->aio_capable is set, which posix_aio_init() does only after a
+ * complete bring-up. When AIO is unavailable at run-time the synchronous
+ * fops therefore stay in place and 0 is still returned.
+ *
+ * Returns the result of posix_aio_init() on the first call, 0 afterwards.
+ */
+int
+posix_aio_on (xlator_t *this)
+{
+        struct posix_private *priv = NULL;
+        int                   ret  = 0;
+
+        priv = this->private;
+
+        if (!priv->aio_init_done) {
+                /* pessimistic default; flipped by posix_aio_init() on
+                 * complete success only */
+                priv->aio_capable = _gf_false;
+                ret = posix_aio_init (this);
+                priv->aio_init_done = _gf_true;
+        }
+
+        if (priv->aio_capable) {
+                this->fops->readv  = posix_aio_readv;
+                this->fops->writev = posix_aio_writev;
+        }
+
+        return ret;
+}
+
+/*
+ * Switch the translator back to the synchronous read/write paths by
+ * pointing this->fops->readv/writev at posix_readv() and posix_writev().
+ * Nothing else is changed here: the io context and the reaper thread are
+ * not touched. Always returns 0.
+ */
+int
+posix_aio_off (xlator_t *this)
+{
+ this->fops->readv = posix_readv;
+ this->fops->writev = posix_writev;
+
+ return 0;
+}
+
+
+#else
+
+
+/*
+ * Build without HAVE_LIBAIO: there is no AIO path to switch to. Logs that
+ * fact at INFO level, leaves the fops untouched and returns 0.
+ */
+int
+posix_aio_on (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+/*
+ * Build without HAVE_LIBAIO: there is no AIO path to switch away from.
+ * Logs that fact at INFO level, leaves the fops untouched and returns 0.
+ */
+int
+posix_aio_off (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+/*
+ * Build without HAVE_LIBAIO: stand-in that keeps the symbol available.
+ * All arguments (fd, pfd, opflags, offset, size) are ignored and the
+ * descriptor's flags are not modified; the only effect is an INFO log
+ * message emitted on every call, under the name of the current THIS.
+ */
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
+#endif
diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h
new file mode 100644
index 000000000..5bde71601
--- /dev/null
+++ b/xlators/storage/posix/src/posix-aio.h
@@ -0,0 +1,39 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _POSIX_AIO_H
+#define _POSIX_AIO_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+// Maximum number of concurrently submitted IO events. The heaviest load
+// GlusterFS has been able to handle had 60-80 concurrent calls
+#define POSIX_AIO_MAX_NR_EVENTS 256
+
+// Maximum number of completed IO operations to reap per getevents syscall
+#define POSIX_AIO_MAX_NR_GETEVENTS 16
+
+
+int posix_aio_on (xlator_t *this);
+int posix_aio_off (xlator_t *this);
+
+int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
+
+#endif /* !_POSIX_AIO_H */
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
new file mode 100644
index 000000000..219a582c9
--- /dev/null
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -0,0 +1,744 @@
+/*
+ Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <libgen.h>
+#ifdef GF_LINUX_HOST_OS
+#include <alloca.h>
+#endif
+
+#include "posix-handle.h"
+#include "posix.h"
+#include "xlator.h"
+#include "syscall.h"
+
+/* Path components and helpers for the gfid handle store kept under the export. */
+#define HANDLE_PFX ".glusterfs" /* handle directory, created directly below base_path */
+#define TRASH_DIR "landfill" /* trash directory, created inside HANDLE_PFX */
+
+#define UUID0_STR "00000000-0000-0000-0000-000000000000" /* used only via SLEN(): the length of a gfid in text form */
+#define SLEN(str) (sizeof(str) - 1) /* length of a string literal without its NUL; not valid for pointers */
+
+
+/*
+ * Build the relative handle path "../../xx/yy/<gfid>[/<basename>]", where
+ * xx and yy are the first two gfid bytes in hex.
+ *
+ * If @buf is NULL or @buflen is smaller than the computed worst-case size,
+ * nothing is written and that size (including the terminating NUL) is
+ * returned. Otherwise the path is written to @buf and the value returned
+ * by snprintf() is passed back.
+ */
+int
+posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename,
+                      char *buf, size_t buflen)
+{
+        int needed = SLEN("../") + SLEN("../") + SLEN("00/") + SLEN("00/")
+                     + SLEN(UUID0_STR) + 1 /* '\0' */;
+
+        if (basename)
+                needed += (strlen (basename) + 1);
+
+        if (!buf || buflen < needed)
+                return needed;
+
+        if (!basename)
+                return snprintf (buf, buflen, "../../%02x/%02x/%s",
+                                 gfid[0], gfid[1], uuid_utoa (gfid));
+
+        return snprintf (buf, buflen, "../../%02x/%02x/%s/%s",
+                         gfid[0], gfid[1], uuid_utoa (gfid), basename);
+}
+
+
+/*
+ * Perform one expansion step on a directory's symlink-handle.
+ *
+ * @base_str names a handle symlink ("<export>/.glusterfs/xx/yy/<gfid>",
+ * @base_len bytes long) and @buf holds a path that starts with it. The
+ * link target is expected to look like "../../pp/qq/<parent-gfid>/<name>".
+ * On success:
+ *   - the part of @buf after @base_len is shifted right to make room,
+ *   - "pp/qq/<parent-gfid>/<name>" is written into @buf at @pfx_len,
+ *   - "pp/qq/<parent-gfid>" is written into @base_str at @pfx_len, so the
+ *     next call operates on the parent's handle.
+ * If the target is exactly "../../.." nothing is shifted; when @buf equals
+ * @base_str its tail is replaced by "..", and @len is returned unchanged.
+ *
+ * Returns the new length of the string in @buf, or -1 when readlink()
+ * fails, the link is malformed, or the expanded path would not fit in
+ * @maxlen bytes (in which case @buf and @base_str are left untouched).
+ *
+ * TODO: explain how this pump fixes ELOOP
+ */
+int
+posix_handle_pump (xlator_t *this, char *buf, int len, int maxlen,
+                   char *base_str, int base_len, int pfx_len)
+{
+        /* "../../<gfid>/<NAME_MAX>"; one spare byte so a full-size read
+           can still be NUL-terminated before it is logged */
+        char      linkname[512 + 1] = {0,};
+        const int link_max = sizeof (linkname) - 1;
+        int       ret = 0;
+        int       blen = 0;
+        int       link_len = 0;
+
+        /* is a directory's symlink-handle */
+        ret = readlink (base_str, linkname, link_max);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "internal readlink failed on %s (%s)",
+                        base_str, strerror (errno));
+                goto err;
+        }
+
+        /* readlink() does not terminate the buffer */
+        linkname[ret] = 0;
+
+        link_len = ret;
+
+        if ((ret == 8) && memcmp (linkname, "../../..", 8) == 0) {
+                if (strcmp (base_str, buf) == 0) {
+                        strcpy (buf + pfx_len, "..");
+                }
+                goto out;
+        }
+
+        /* too short to hold "../../xx/yy/<gfid>/<name>", or truncated */
+        if (ret < 50 || ret >= link_max)
+                goto malformed;
+
+        if (memcmp (linkname, "../../", 6) != 0)
+                goto malformed;
+
+        /* separators around "..", the two hash components and the gfid */
+        if ((linkname[2] != '/') ||
+            (linkname[5] != '/') ||
+            (linkname[8] != '/') ||
+            (linkname[11] != '/') ||
+            (linkname[48] != '/'))
+                goto malformed;
+
+        /* dashes of the textual gfid that starts at offset 12 */
+        if ((linkname[20] != '-') ||
+            (linkname[25] != '-') ||
+            (linkname[30] != '-') ||
+            (linkname[35] != '-'))
+                goto malformed;
+
+        /* length of "/<name>", the amount by which @buf grows */
+        blen = link_len - 48;
+
+        /* refuse before touching @buf: the shift below writes
+           len + blen + 1 bytes */
+        if (len + blen >= maxlen) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to form handle path for %s (maxlen = %d)",
+                        buf, maxlen);
+                goto err;
+        }
+
+        memmove (buf + base_len + blen, buf + base_len,
+                 (strlen (buf) - base_len) + 1);
+
+        /* 42 == strlen ("pp/qq/") + length of a textual gfid */
+        strncpy (base_str + pfx_len, linkname + 6, 42);
+
+        strncpy (buf + pfx_len, linkname + 6, link_len - 6);
+out:
+        return len + blen;
+malformed:
+        gf_log (this->name, GF_LOG_ERROR,
+                "malformed internal link %s for %s",
+                linkname, base_str);
+err:
+        return -1;
+}
+
+
+/*
+ posix_handle_path differs from posix_handle_gfid_path in the way that the
+ path filled in @buf by posix_handle_path will return type IA_IFDIR when
+ an lstat() is performed on it, whereas posix_handle_gfid_path returns path
+ to the handle symlink (typically used for the purpose of unlinking it).
+
+ posix_handle_path also guarantees immunity to ELOOP on the path returned by it
+
+ When @ubuf is NULL a PATH_MAX scratch buffer on the stack is used, so the
+ call only measures. The return value is the path length plus one (room for
+ the terminating NUL), i.e. the buffer size a caller has to provide.
+*/
+
+int
+posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename,
+ char *ubuf, size_t size)
+{
+ struct posix_private *priv = NULL;
+ char *uuid_str = NULL;
+ int len = 0;
+ int ret = -1;
+ struct stat stat;
+ char *base_str = NULL;
+ int base_len = 0;
+ int pfx_len;
+ int maxlen;
+ char *buf;
+
+ priv = this->private;
+
+ uuid_str = uuid_utoa (gfid);
+
+ if (ubuf) {
+ buf = ubuf;
+ maxlen = size;
+ } else {
+ maxlen = PATH_MAX;
+ buf = alloca (maxlen);
+ }
+
+ base_len = (priv->base_path_length + SLEN(HANDLE_PFX) + 45); /* upper bound for "<base>/.glusterfs/xx/yy/<gfid>" */
+ base_str = alloca (base_len + 1);
+ base_len = snprintf (base_str, base_len + 1, "%s/%s/%02x/%02x/%s",
+ priv->base_path, HANDLE_PFX, gfid[0], gfid[1],
+ uuid_str);
+
+ pfx_len = priv->base_path_length + 1 + SLEN(HANDLE_PFX) + 1; /* length of "<base>/.glusterfs/" */
+
+ if (basename) {
+ len = snprintf (buf, maxlen, "%s/%s", base_str, basename);
+ } else {
+ len = snprintf (buf, maxlen, "%s", base_str);
+ }
+
+ ret = lstat (base_str, &stat);
+
+ if (!(ret == 0 && S_ISLNK(stat.st_mode) && stat.st_nlink == 1)) /* only a symlink handle needs expansion */
+ goto out;
+
+ do {
+ errno = 0;
+ ret = posix_handle_pump (this, buf, len, maxlen,
+ base_str, base_len, pfx_len); /* rewrites buf and base_str by one level */
+ if (ret == -1)
+ break;
+
+ len = ret;
+
+ ret = lstat (buf, &stat);
+ } while ((ret == -1) && errno == ELOOP); /* keep expanding while the path still loops */
+
+out:
+ return len + 1;
+}
+
+
+/*
+ * Build "<base_path>/.glusterfs/xx/yy/<gfid>[/<basename>]", the path of the
+ * handle entry itself. For the root gfid the result is "<base_path>" or
+ * "<base_path>/<basename>" instead.
+ *
+ * If @buf is NULL or @buflen is below the computed worst-case size, nothing
+ * is written and that size is returned. Otherwise snprintf()'s result is
+ * returned, except for the root gfid without @basename, where the
+ * worst-case size is returned.
+ */
+int
+posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename,
+                        char *buf, size_t buflen)
+{
+        struct posix_private *conf = this->private;
+        char                 *gfid_text = NULL;
+        int                   needed = 0;
+
+        needed = conf->base_path_length /* option directory "/export" */
+                 + SLEN("/") + SLEN(HANDLE_PFX) + SLEN("/")
+                 + SLEN("00/") + SLEN("00/")
+                 + SLEN(UUID0_STR)
+                 + 1 /* '\0' */;
+
+        if (basename)
+                needed += (strlen (basename) + 1);
+        else
+                needed += 256; /* worst-case for directory's symlink-handle expansion */
+
+        if (!buf || (buflen < needed))
+                return needed;
+
+        gfid_text = uuid_utoa (gfid);
+
+        if (__is_root_gfid (gfid)) {
+                if (!basename) {
+                        strncpy (buf, conf->base_path, buflen);
+                        return needed;
+                }
+                return snprintf (buf, buflen, "%s/%s", conf->base_path,
+                                 basename);
+        }
+
+        if (!basename)
+                return snprintf (buf, buflen, "%s/%s/%02x/%02x/%s",
+                                 conf->base_path, HANDLE_PFX, gfid[0],
+                                 gfid[1], gfid_text);
+
+        return snprintf (buf, buflen, "%s/%s/%02x/%02x/%s/%s",
+                         conf->base_path, HANDLE_PFX, gfid[0], gfid[1],
+                         gfid_text, basename);
+}
+
+/*
+ * Prepare the handle store of this export:
+ *   1. check that base_path is a directory,
+ *   2. make sure "<base_path>/.glusterfs" exists and is a directory, and
+ *      remember its stat in priv->handledir,
+ *   3. make sure the handle of gfid 00..01 exists: create it as a symlink to
+ *      "../../.." when missing, otherwise check that it resolves to the same
+ *      inode/device as base_path.
+ * Returns 0 on success and -1 on any failure (after logging it).
+ */
+int
+posix_handle_init (xlator_t *this)
+{
+ struct posix_private *priv = NULL;
+ char *handle_pfx = NULL;
+ int ret = 0;
+ int len = 0;
+ struct stat stbuf;
+ struct stat rootbuf;
+ struct stat exportbuf;
+ char *rootstr = NULL;
+ uuid_t gfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; /* gfid whose handle must point back at the export directory */
+
+ priv = this->private;
+
+ ret = stat (priv->base_path, &exportbuf);
+ if (ret || !S_ISDIR (exportbuf.st_mode)) { /* a failed stat is reported with the same message */
+ gf_log (this->name, GF_LOG_ERROR,
+ "Not a directory: %s", priv->base_path);
+ return -1;
+ }
+
+ handle_pfx = alloca (priv->base_path_length + 1 + strlen (HANDLE_PFX)
+ + 1);
+
+ sprintf (handle_pfx, "%s/%s", priv->base_path, HANDLE_PFX);
+
+ ret = stat (handle_pfx, &stbuf);
+ switch (ret) {
+ case -1:
+ if (errno == ENOENT) {
+ ret = mkdir (handle_pfx, 0600);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Creating directory %s failed: %s",
+ handle_pfx, strerror (errno));
+ return -1;
+ }
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Checking for %s failed: %s",
+ handle_pfx, strerror (errno));
+ return -1;
+ }
+ break;
+ case 0:
+ if (!S_ISDIR (stbuf.st_mode)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Not a directory: %s",
+ handle_pfx);
+ return -1;
+ }
+ break;
+ default:
+ break;
+ }
+
+ stat (handle_pfx, &priv->handledir); /* NOTE(review): return value is not checked */
+
+ len = posix_handle_path (this, gfid, NULL, NULL, 0); /* measure first, then fill */
+ rootstr = alloca (len);
+ posix_handle_path (this, gfid, NULL, rootstr, len);
+
+ ret = stat (rootstr, &rootbuf);
+ switch (ret) {
+ case -1:
+ if (errno != ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: %s", priv->base_path,
+ strerror (errno));
+ return -1;
+ }
+
+ ret = posix_handle_mkdir_hashes (this, rootstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mkdir %s failed (%s)",
+ rootstr, strerror (errno));
+ return -1;
+ }
+
+ ret = symlink ("../../..", rootstr); /* from .glusterfs/00/00/ three levels up is the export directory */
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "symlink %s creation failed (%s)",
+ rootstr, strerror (errno));
+ return -1;
+ }
+ break;
+ case 0:
+ if ((exportbuf.st_ino == rootbuf.st_ino) &&
+ (exportbuf.st_dev == rootbuf.st_dev))
+ return 0;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "Different dirs %s (%lld/%lld) != %s (%lld/%lld)",
+ priv->base_path, (long long) exportbuf.st_ino,
+ (long long) exportbuf.st_dev, rootstr,
+ (long long) rootbuf.st_ino, (long long) rootbuf.st_dev);
+ return -1;
+
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Report whether @old_trash is a directory that carries no "trusted.gfid"
+ * xattr (lgetxattr fails with ENODATA). Every other outcome, including a
+ * failing lstat(), yields _gf_false.
+ */
+gf_boolean_t
+posix_does_old_trash_exists (char *old_trash)
+{
+        uuid_t      scratch_gfid = {0};
+        struct stat st = {0};
+        int         rc = 0;
+
+        rc = lstat (old_trash, &st);
+        if (rc != 0)
+                return _gf_false;
+
+        if (!S_ISDIR (st.st_mode))
+                return _gf_false;
+
+        rc = sys_lgetxattr (old_trash, "trusted.gfid", scratch_gfid, 16);
+        if ((rc < 0) && (errno == ENODATA))
+                return _gf_true;
+
+        return _gf_false;
+}
+
+/*
+ * Make sure @trash exists as a directory, creating it with mode 0755 when
+ * lstat() reports ENOENT. Returns 0 when the directory exists or was
+ * created, and a non-zero value (after logging) otherwise.
+ */
+int
+posix_handle_new_trash_init (xlator_t *this, char *trash)
+{
+        struct stat st = {0};
+        int         rc = 0;
+
+        rc = lstat (trash, &st);
+
+        if (rc == 0) {
+                if (!S_ISDIR (st.st_mode)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Not a directory: %s", trash);
+                        rc = -1;
+                }
+        } else if (rc == -1) {
+                if (errno != ENOENT) {
+                        gf_log (this->name, GF_LOG_ERROR, "Checking for %s "
+                                "failed: %s", trash, strerror (errno));
+                } else {
+                        rc = mkdir (trash, 0755);
+                        if (rc != 0) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Creating directory %s failed: %s",
+                                        trash, strerror (errno));
+                        }
+                }
+        }
+
+        return rc;
+}
+
+/*
+ * If @old qualifies as an old trash directory (see
+ * posix_does_old_trash_exists), rename it to "<new>/<freshly generated
+ * uuid>". Returns 0 when there was nothing to move or the rename worked,
+ * otherwise rename()'s negative result.
+ */
+int
+posix_mv_old_trash_into_new_trash (xlator_t *this, char *old, char *new)
+{
+        char   target[PATH_MAX] = {0};
+        uuid_t fresh_name = {0};
+        int    rc = 0;
+
+        if (!posix_does_old_trash_exists (old))
+                return rc;
+
+        uuid_generate (fresh_name);
+        snprintf (target, sizeof (target), "%s/%s", new,
+                  uuid_utoa (fresh_name));
+
+        rc = rename (old, target);
+        if (rc < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Not able to move "
+                        "%s -> %s (%s)", old, target, strerror (errno));
+        }
+
+        return rc;
+}
+
+/*
+ * Set priv->trash_path to "<base_path>/.glusterfs/landfill", make sure
+ * that directory exists, and move a pre-existing "<base_path>/.landfill"
+ * into it. Returns -1 when the path cannot be allocated, otherwise the
+ * result of the first failing step, or 0.
+ */
+int
+posix_handle_trash_init (xlator_t *this)
+{
+        struct posix_private *conf = this->private;
+        char                  legacy_trash[PATH_MAX] = {0};
+        int                   rc = -1;
+
+        conf->trash_path = GF_CALLOC (1, conf->base_path_length + strlen ("/")
+                                      + strlen (HANDLE_PFX) + strlen ("/")
+                                      + strlen (TRASH_DIR) + 1,
+                                      gf_posix_mt_trash_path);
+        if (conf->trash_path == NULL)
+                return rc;
+
+        /* the buffer is zero-filled, so the copy below stays terminated */
+        strncpy (conf->trash_path, conf->base_path, conf->base_path_length);
+        strcat (conf->trash_path, "/" HANDLE_PFX "/" TRASH_DIR);
+
+        rc = posix_handle_new_trash_init (this, conf->trash_path);
+        if (rc != 0)
+                return rc;
+
+        snprintf (legacy_trash, sizeof (legacy_trash), "%s/.landfill",
+                  conf->base_path);
+
+        return posix_mv_old_trash_into_new_trash (this, legacy_trash,
+                                                   conf->trash_path);
+}
+/*
+ * Create the two directories above @newpath's last component: first the
+ * grandparent, then the parent, both with mode 0700. An already existing
+ * directory (EEXIST) is not an error. Returns 0 on success, -1 after logging
+ * when a mkdir() fails for another reason.
+ */
+int
+posix_handle_mkdir_hashes (xlator_t *this, const char *newpath)
+{
+ char *duppath = NULL;
+ char *parpath = NULL;
+ int ret = 0;
+
+ duppath = strdupa (newpath);
+ parpath = dirname (duppath);
+ parpath = dirname (duppath); /* yields the grandparent only if the first dirname() truncated duppath in place -- TODO confirm on every supported libc */
+
+ ret = mkdir (parpath, 0700);
+ if (ret == -1 && errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error mkdir hash-1 %s (%s)",
+ parpath, strerror (errno));
+ return -1;
+ }
+
+ strcpy (duppath, newpath); /* restore the full path before taking the direct parent */
+ parpath = dirname (duppath);
+
+ ret = mkdir (parpath, 0700);
+ if (ret == -1 && errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error mkdir hash-2 %s (%s)",
+ parpath, strerror (errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Make sure a hard-link handle for @gfid exists and refers to the same
+ * inode/device as @oldbuf. When the handle path does not exist yet, its hash
+ * directories are created and @oldpath is hard-linked to it.
+ * Returns 0 on success; -1 (after logging) when a syscall fails or when the
+ * handle's ino/dev differ from @oldbuf. @oldbuf must not be NULL.
+ */
+int
+posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat *oldbuf)
+{
+ char *newpath = NULL;
+ struct stat newbuf;
+ int ret = -1;
+
+
+ MAKE_HANDLE_PATH (newpath, this, gfid, NULL);
+
+ ret = lstat (newpath, &newbuf);
+ if (ret == -1 && errno != ENOENT) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: %s", newpath, strerror (errno));
+ return -1;
+ }
+
+ if (ret == -1 && errno == ENOENT) { /* no handle yet: create it */
+ ret = posix_handle_mkdir_hashes (this, newpath);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mkdir %s failed (%s)",
+ newpath, strerror (errno));
+ return -1;
+ }
+
+#ifdef HAVE_LINKAT
+ /*
+ * Use linkat if the target may be a symlink to a directory
+ * or without an existing target. See comment about linkat()
+ * usage in posix_link() in posix.c for details
+ */
+ ret = linkat (AT_FDCWD, oldpath, AT_FDCWD, newpath, 0);
+#else
+ ret = link (oldpath, newpath);
+#endif
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "link %s -> %s failed (%s)",
+ oldpath, newpath, strerror (errno));
+ return -1;
+ }
+
+ ret = lstat (newpath, &newbuf); /* refresh newbuf for the comparison below */
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "lstat on %s failed (%s)",
+ newpath, strerror (errno));
+ return -1;
+ }
+ }
+
+ if (newbuf.st_ino != oldbuf->st_ino ||
+ newbuf.st_dev != oldbuf->st_dev) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mismatching ino/dev between file %s (%lld/%lld) "
+ "and handle %s (%lld/%lld)",
+ oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev,
+ newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev);
+ ret = -1;
+ }
+
+ return ret;
+}
+
+
+/*
+ * Make sure a symlink handle for @gfid exists. When the handle path is
+ * missing, its hash directories are created and a symlink pointing at the
+ * relative handle path of (@loc->pargfid, @loc->name) is made there.
+ * Afterwards @real_path is stat()ed and, if @oldbuf is given, its ino/dev
+ * are compared with @oldbuf.
+ *
+ * Returns 0 on success; -1 (after logging) when a syscall fails or the
+ * ino/dev comparison does not match.
+ */
+int
+posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc,
+                   uuid_t gfid, struct stat *oldbuf)
+{
+        char        *oldpath = NULL;
+        char        *newpath = NULL;
+        struct stat  newbuf;
+        int          ret = -1;
+
+
+        MAKE_HANDLE_PATH (newpath, this, gfid, NULL);
+        MAKE_HANDLE_RELPATH (oldpath, this, loc->pargfid, loc->name);
+
+
+        ret = lstat (newpath, &newbuf);
+        if (ret == -1 && errno != ENOENT) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s: %s", newpath, strerror (errno));
+                return -1;
+        }
+
+        if (ret == -1 && errno == ENOENT) {
+                /* no handle yet: create hash dirs and the symlink */
+                ret = posix_handle_mkdir_hashes (this, newpath);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "mkdir %s failed (%s)",
+                                newpath, strerror (errno));
+                        return -1;
+                }
+
+                ret = symlink (oldpath, newpath);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "symlink %s -> %s failed (%s)",
+                                oldpath, newpath, strerror (errno));
+                        return -1;
+                }
+
+                ret = lstat (newpath, &newbuf);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "lstat on %s failed (%s)",
+                                newpath, strerror (errno));
+                        return -1;
+                }
+        }
+
+        ret = stat (real_path, &newbuf);
+        if (ret) {
+                /* report the path that was actually stat()ed */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "stat on %s failed (%s)", real_path,
+                        strerror (errno));
+                return -1;
+        }
+
+        if (!oldbuf)
+                return ret;
+
+        if (newbuf.st_ino != oldbuf->st_ino ||
+            newbuf.st_dev != oldbuf->st_dev) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "mismatching ino/dev between file %s (%lld/%lld) "
+                        "and handle %s (%lld/%lld)",
+                        oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev,
+                        newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev);
+                ret = -1;
+        }
+
+        return ret;
+}
+
+
+/*
+ * Unlink the handle entry of @gfid. A missing handle is reported through
+ * the -1 return of lstat() but not logged; other lstat() or unlink()
+ * failures are logged. Returns 0 on success and -1 otherwise.
+ */
+static int
+posix_handle_unset_gfid (xlator_t *this, uuid_t gfid)
+{
+        struct stat  hstat;
+        char        *handle = NULL;
+        int          rc = 0;
+
+        MAKE_HANDLE_GFID_PATH (handle, this, gfid, NULL);
+
+        rc = lstat (handle, &hstat);
+        if (rc == -1) {
+                if (errno != ENOENT) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s: %s", handle, strerror (errno));
+                }
+                return rc;
+        }
+
+        rc = unlink (handle);
+        if (rc == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unlink %s failed (%s)", handle, strerror (errno));
+        }
+
+        return rc;
+}
+
+
+/*
+ * Remove a handle. Without @basename the handle of @gfid itself is
+ * removed. With @basename, the entry @basename under the directory @gfid is
+ * looked up with posix_istat() and the handle of the gfid found there is
+ * removed. Returns the result of posix_handle_unset_gfid(), or -1 when the
+ * lookup fails.
+ */
+int
+posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename)
+{
+        struct iatt  entry;
+        char        *entry_path = NULL;
+
+        if (basename == NULL)
+                return posix_handle_unset_gfid (this, gfid);
+
+        MAKE_HANDLE_PATH (entry_path, this, gfid, basename);
+
+        if (posix_istat (this, gfid, basename, &entry) == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s: %s", entry_path, strerror (errno));
+                return -1;
+        }
+
+        return posix_handle_unset_gfid (this, entry.ia_gfid);
+}
+
+
+/*
+ * If the handle of @gfid exists, hard-link it to @real_path. Returns the
+ * non-zero lstat() result when the handle cannot be stat()ed, otherwise
+ * the result of linkat()/link().
+ */
+int
+posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid,
+                                  char *real_path)
+{
+        struct stat  hstat = {0,};
+        char        *handle = NULL;
+        int          rc = -1;
+
+        MAKE_HANDLE_PATH (handle, this, gfid, NULL);
+
+        rc = lstat (handle, &hstat);
+        if (rc != 0)
+                return rc;
+
+#ifdef HAVE_LINKAT
+        /*
+         * Use linkat if the target may be a symlink to a directory
+         * or without an existing target. See comment about linkat()
+         * usage in posix_link() in posix.c for details
+         */
+        return linkat (AT_FDCWD, handle, AT_FDCWD, real_path, 0);
+#else
+        return link (handle, real_path);
+#endif
+}
diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h
new file mode 100644
index 000000000..f1163b727
--- /dev/null
+++ b/xlators/storage/posix/src/posix-handle.h
@@ -0,0 +1,143 @@
+/*
+ Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _POSIX_HANDLE_H
+#define _POSIX_HANDLE_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include "xlator.h"
+
+
+/*
+ * True when @loc is non-NULL and its path starts with '/'. The argument is
+ * parenthesized at every use so that expressions such as *pp or &obj work.
+ */
+#define LOC_HAS_ABSPATH(loc) ((loc) && ((loc)->path) && ((loc)->path[0] == '/'))
+/*
+ * Set @var to an alloca() buffer holding POSIX_BASE_PATH(this) immediately
+ * followed by @path (no separator is inserted). The buffer lives until the
+ * calling function returns. @path and @this are evaluated more than once.
+ */
+#define MAKE_REAL_PATH(var, this, path) do { \
+ var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \
+ strcpy (var, POSIX_BASE_PATH(this)); \
+ strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \
+ } while (0)
+
+/*
+ * Set @var to an alloca() buffer filled by posix_handle_path() for
+ * (@gfid, @base): the first call only measures, the second one fills.
+ * @var is left untouched when the measured length is <= 0. The buffer lives
+ * until the calling function returns.
+ */
+#define MAKE_HANDLE_PATH(var, this, gfid, base) do { \
+ int __len; \
+ __len = posix_handle_path (this, gfid, base, NULL, 0); \
+ if (__len <= 0) \
+ break; \
+ var = alloca (__len); \
+ __len = posix_handle_path (this, gfid, base, var, __len); \
+ } while (0)
+
+/*
+ * Same measure-then-fill pattern as MAKE_HANDLE_PATH, but using
+ * posix_handle_gfid_path(): @var receives an alloca() buffer with the path of
+ * the handle entry itself. @var is left untouched when the length is <= 0.
+ */
+#define MAKE_HANDLE_GFID_PATH(var, this, gfid, base) do { \
+ int __len = 0; \
+ __len = posix_handle_gfid_path (this, gfid, base, NULL, 0); \
+ if (__len <= 0) \
+ break; \
+ var = alloca (__len); \
+ __len = posix_handle_gfid_path (this, gfid, base, var, __len); \
+ } while (0)
+
+/*
+ * Same measure-then-fill pattern, using posix_handle_relpath(): @var receives
+ * an alloca() buffer with the relative handle path for (@gfid, @base).
+ * @var is left untouched when the measured length is <= 0.
+ */
+#define MAKE_HANDLE_RELPATH(var, this, gfid, base) do { \
+ int __len; \
+ __len = posix_handle_relpath (this, gfid, base, NULL, 0); \
+ if (__len <= 0) \
+ break; \
+ var = alloca (__len); \
+ __len = posix_handle_relpath (this, gfid, base, var, __len); \
+ } while (0)
+
+/*
+ * Resolve @loc to a backend path in @rpath and stat it into @iatt_p.
+ * The result of the stat is stored in a variable named op_ret that must exist
+ * in the caller's scope.
+ *   - null loc->gfid: logs an error; @rpath and op_ret are not touched.
+ *   - absolute loc->path: @rpath is base path + loc->path, stat via
+ *     posix_pstat().
+ *   - otherwise: stat via posix_istat() on the gfid handle; @rpath is set to
+ *     the handle path unless errno is ELOOP afterwards, in which case @rpath
+ *     is not touched.
+ */
+#define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) do { \
+ if (uuid_is_null (loc->gfid)) { \
+ gf_log (this->name, GF_LOG_ERROR, \
+ "null gfid for path %s", loc->path); \
+ break; \
+ } \
+ if (LOC_HAS_ABSPATH (loc)) { \
+ MAKE_REAL_PATH (rpath, this, loc->path); \
+ op_ret = posix_pstat (this, loc->gfid, rpath, iatt_p); \
+ break; \
+ } \
+ errno = 0; \
+ op_ret = posix_istat (this, loc->gfid, NULL, iatt_p); \
+ if (errno != ELOOP) { \
+ MAKE_HANDLE_PATH (rpath, this, loc->gfid, NULL); \
+ break; \
+ } \
+ /* __ret == -1 && errno == ELOOP */ \
+ } while (0)
+
+/*
+ * Resolve the entry (@loc->pargfid, @loc->name) to backend paths: @entp for
+ * the entry and @parp for its parent directory, and stat the entry into
+ * @ent_p. The stat result is stored in a variable named op_ret that must
+ * exist in the caller's scope.
+ *   - null pargfid or missing name: logs an error; nothing else is set.
+ *   - absolute loc->path: @entp is base path + loc->path, @parp is dirname()
+ *     of a stack copy of it, stat via posix_pstat().
+ *   - otherwise: stat via posix_istat(); both paths are built from the
+ *     parent's handle unless errno is ELOOP afterwards, in which case neither
+ *     path is set.
+ */
+#define MAKE_ENTRY_HANDLE(entp, parp, this, loc, ent_p) do { \
+ char *__parp; \
+ \
+ if (uuid_is_null (loc->pargfid) || !loc->name) { \
+ gf_log (this->name, GF_LOG_ERROR, \
+ "null pargfid/name for path %s", loc->path); \
+ break; \
+ } \
+ \
+ if (LOC_HAS_ABSPATH (loc)) { \
+ MAKE_REAL_PATH (entp, this, loc->path); \
+ __parp = strdupa (entp); \
+ parp = dirname (__parp); \
+ op_ret = posix_pstat (this, NULL, entp, ent_p); \
+ break; \
+ } \
+ errno = 0; \
+ op_ret = posix_istat (this, loc->pargfid, loc->name, ent_p); \
+ if (errno != ELOOP) { \
+ MAKE_HANDLE_PATH (parp, this, loc->pargfid, NULL); \
+ MAKE_HANDLE_PATH (entp, this, loc->pargfid, loc->name); \
+ break; \
+ } \
+ /* __ret == -1 && errno == ELOOP */ \
+ /* expand ELOOP */ \
+ } while (0)
+
+
+
+/*
+ * Handle-store API. The three path builders return the required buffer
+ * size when called with a NULL or too small buffer, which is what the
+ * MAKE_HANDLE_* macros above rely on.
+ */
+int
+posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, char *buf,
+                   size_t len);
+/* NOTE(review): no definition of posix_handle_path_safe is visible in
+   posix-handle.c of this change -- confirm it is still needed */
+int
+posix_handle_path_safe (xlator_t *this, uuid_t gfid, const char *basename,
+                        char *buf, size_t len);
+
+int
+posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename,
+                        char *buf, size_t len);
+
+/* used by MAKE_HANDLE_RELPATH, so it has to be declared for every includer */
+int
+posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename,
+                      char *buf, size_t buflen);
+
+int
+posix_handle_hard (xlator_t *this, const char *path, uuid_t gfid,
+                   struct stat *buf);
+
+
+int
+posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc,
+                   uuid_t gfid, struct stat *buf);
+
+int
+posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename);
+
+int posix_handle_mkdir_hashes (xlator_t *this, const char *newpath);
+
+int posix_handle_init (xlator_t *this);
+
+int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid,
+                                      char *real_path);
+
+int
+posix_handle_trash_init (xlator_t *this);
+#endif /* !_POSIX_HANDLE_H */
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
new file mode 100644
index 000000000..e295f8850
--- /dev/null
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -0,0 +1,1391 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#define __XOPEN_SOURCE 500
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <ftw.h>
+#include <sys/stat.h>
+#include <signal.h>
+
+#ifndef GF_BSD_HOST_OS
+#include <alloca.h>
+#endif /* GF_BSD_HOST_OS */
+
+#include "glusterfs.h"
+#include "checksum.h"
+#include "dict.h"
+#include "logging.h"
+#include "posix.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+#include "syscall.h"
+#include "statedump.h"
+#include "locking.h"
+#include "timer.h"
+#include "glusterfs3-xdr.h"
+#include "hashfn.h"
+#include "glusterfs-acl.h"
+#include <fnmatch.h>
+/*
+ * NULL-terminated fnmatch() patterns; the array has the shape that
+ * posix_special_xattr() expects. NOTE(review): no user of marker_xattrs is
+ * visible in this part of the file.
+ */
+char *marker_xattrs[] = {"trusted.glusterfs.quota.*",
+ "trusted.glusterfs.*.xtime",
+ NULL};
+
+static char* posix_ignore_xattrs[] = { /* request keys that posix_xattr_ignorable() always skips (exact match) */
+ "gfid-req",
+ GLUSTERFS_ENTRYLK_COUNT,
+ GLUSTERFS_INODELK_COUNT,
+ GLUSTERFS_POSIXLK_COUNT,
+ NULL
+};
+
+/*
+ * Return _gf_true when @key matches (fnmatch, no flags) any pattern of
+ * the NULL-terminated array @pattern. A NULL @pattern or @key yields
+ * _gf_false.
+ */
+gf_boolean_t
+posix_special_xattr (char **pattern, char *key)
+{
+        gf_boolean_t   matched = _gf_false;
+        char         **cursor = NULL;
+
+        GF_VALIDATE_OR_GOTO ("posix", pattern, out);
+        GF_VALIDATE_OR_GOTO ("posix", key, out);
+
+        for (cursor = pattern; *cursor && !matched; cursor++) {
+                if (fnmatch (*cursor, key, 0) == 0)
+                        matched = _gf_true;
+        }
+out:
+        return matched;
+}
+
+/*
+ * Decide whether a requested xattr key is to be skipped: either it is
+ * listed in posix_ignore_xattrs, or it is GF_CONTENT_KEY and the object
+ * described by filler->stbuf is not a regular file. A NULL @key is never
+ * ignorable.
+ */
+static gf_boolean_t
+posix_xattr_ignorable (char *key, posix_xattr_filler_t *filler)
+{
+        int idx = 0;
+
+        GF_ASSERT (key);
+        if (!key)
+                return _gf_false;
+
+        for (idx = 0; posix_ignore_xattrs[idx] != NULL; idx++) {
+                if (strcmp (key, posix_ignore_xattrs[idx]) == 0)
+                        return _gf_true;
+        }
+
+        if (strcmp (key, GF_CONTENT_KEY) != 0)
+                return _gf_false;
+
+        if (IA_ISREG (filler->stbuf->ia_type))
+                return _gf_false;
+
+        return _gf_true;
+}
+/*
+ * dict_foreach() callback used by posix_lookup_xattr_fill(): for one requested
+ * @key, add the matching value to filler->xattr.
+ *   - ignorable keys (see posix_xattr_ignorable) are skipped,
+ *   - GF_CONTENT_KEY on a regular file: the file content is added when the
+ *     requested size is at least the file size,
+ *   - GLUSTERFS_OPEN_FD_COUNT: loc->inode->fd_count is added when a loc is set,
+ *   - anything else: the xattr of that name is read from filler->real_path.
+ * @xattrargs is the posix_xattr_filler_t. Returns 0 in all cases except the
+ * two early "return -1" paths of the generic xattr branch.
+ */
+static int
+_posix_xattr_get_set (dict_t *xattr_req,
+ char *key,
+ data_t *data,
+ void *xattrargs)
+{
+ posix_xattr_filler_t *filler = xattrargs;
+ char *value = NULL;
+ ssize_t xattr_size = -1;
+ int ret = -1;
+ char *databuf = NULL;
+ int _fd = -1;
+ loc_t *loc = NULL;
+ ssize_t req_size = 0;
+
+
+ if (posix_xattr_ignorable (key, filler))
+ goto out;
+ /* should size be put into the data_t ? */
+ if (!strcmp (key, GF_CONTENT_KEY)
+ && IA_ISREG (filler->stbuf->ia_type)) {
+
+ /* file content request */
+ req_size = data_to_uint64 (data);
+ if (req_size >= filler->stbuf->ia_size) {
+ _fd = open (filler->real_path, O_RDONLY);
+ if (_fd == -1) {
+ gf_log (filler->this->name, GF_LOG_ERROR,
+ "Opening file %s failed: %s",
+ filler->real_path, strerror (errno));
+ goto err;
+ }
+
+ databuf = GF_CALLOC (1, filler->stbuf->ia_size,
+ gf_posix_mt_char);
+ if (!databuf) {
+ goto err;
+ }
+
+ ret = read (_fd, databuf, filler->stbuf->ia_size); /* NOTE(review): a short read is not detected; ia_size bytes are handed to the dict below */
+ if (ret == -1) {
+ gf_log (filler->this->name, GF_LOG_ERROR,
+ "Read on file %s failed: %s",
+ filler->real_path, strerror (errno));
+ goto err;
+ }
+
+ ret = close (_fd);
+ _fd = -1;
+ if (ret == -1) {
+ gf_log (filler->this->name, GF_LOG_ERROR,
+ "Close on file %s failed: %s",
+ filler->real_path, strerror (errno));
+ goto err;
+ }
+
+ ret = dict_set_bin (filler->xattr, key,
+ databuf, filler->stbuf->ia_size);
+ if (ret < 0) {
+ gf_log (filler->this->name, GF_LOG_ERROR,
+ "failed to set dict value. key: %s, path: %s",
+ key, filler->real_path);
+ goto err;
+ }
+
+ /* To avoid double free in cleanup below */
+ databuf = NULL;
+ err: /* also reached by falling through on success */
+ if (_fd != -1)
+ close (_fd);
+ GF_FREE (databuf);
+ }
+ } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) {
+ loc = filler->loc;
+ if (loc) {
+ ret = dict_set_uint32 (filler->xattr, key,
+ loc->inode->fd_count);
+ if (ret < 0)
+ gf_log (filler->this->name, GF_LOG_WARNING,
+ "Failed to set dictionary value for %s",
+ key);
+ }
+ } else {
+ xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); /* size probe */
+
+ if (xattr_size > 0) {
+ value = GF_CALLOC (1, xattr_size + 1,
+ gf_posix_mt_char);
+ if (!value)
+ return -1;
+
+ xattr_size = sys_lgetxattr (filler->real_path, key, value,
+ xattr_size);
+ if (xattr_size <= 0) {
+ gf_log (filler->this->name, GF_LOG_WARNING,
+ "getxattr failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value);
+ return -1;
+ }
+
+ value[xattr_size] = '\0';
+ ret = dict_set_bin (filler->xattr, key,
+ value, xattr_size);
+ if (ret < 0) {
+ gf_log (filler->this->name, GF_LOG_DEBUG,
+ "dict set failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value); /* presumably dict_set_bin() does not take ownership on failure -- confirm */
+ }
+ }
+ }
+out:
+ return 0;
+}
+
+
+/*
+ * Read the GFID_XATTR_KEY xattr of @path into iatt->ia_gfid. Returns 0
+ * when @iatt is NULL, when 16 bytes were read, or when the call failed
+ * with -1; any other size returned by lgetxattr is passed back as is.
+ */
+int
+posix_fill_gfid_path (xlator_t *this, const char *path, struct iatt *iatt)
+{
+        ssize_t got = 0;
+
+        if (!iatt)
+                return 0;
+
+        got = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16);
+
+        /* Return value of getxattr */
+        if ((got != 16) && (got != -1))
+                return got;
+
+        return 0;
+}
+
+
+/*
+ * Read the GFID_XATTR_KEY xattr of the open descriptor @fd into
+ * iatt->ia_gfid. Returns 0 when @iatt is NULL, when 16 bytes were read, or
+ * when the call failed with -1; any other size returned by fgetxattr is
+ * passed back as is.
+ */
+int
+posix_fill_gfid_fd (xlator_t *this, int fd, struct iatt *iatt)
+{
+        ssize_t got = 0;
+
+        if (!iatt)
+                return 0;
+
+        got = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16);
+
+        /* Return value of getxattr */
+        if ((got != 16) && (got != -1))
+                return got;
+
+        return 0;
+}
+
+/*
+ * Derive buf->ia_ino from buf->ia_gfid: the last 8 gfid bytes are read as
+ * a big-endian number (byte 15 is the least significant). A null gfid
+ * gives ia_ino = -1.
+ */
+void
+posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf)
+{
+        uint64_t ino = 0;
+        int      k = 0;
+
+        if (uuid_is_null (buf->ia_gfid)) {
+                buf->ia_ino = -1;
+                return;
+        }
+
+        /* consider least significant 8 bytes of value out of gfid */
+        for (k = 0; k < 8; k++)
+                ino |= (uint64_t)(buf->ia_gfid[15 - k]) << (8 * k);
+
+        buf->ia_ino = ino;
+}
+
+/*
+ * fstat() @fd and convert the result to a struct iatt, with gfid and ino
+ * filled in. For non-directories with a non-zero link count the count is
+ * reduced by one. The result is copied to @stbuf_p when that is non-NULL.
+ * Returns -1 when fstat() fails, otherwise the result of
+ * posix_fill_gfid_fd().
+ */
+int
+posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p)
+{
+        struct stat raw = {0, };
+        struct iatt attrs = {0, };
+        int         rc = 0;
+
+        rc = fstat (fd, &raw);
+        if (rc == -1)
+                return rc;
+
+        if ((raw.st_nlink != 0) && !S_ISDIR (raw.st_mode))
+                raw.st_nlink--;
+
+        iatt_from_stat (&attrs, &raw);
+
+        rc = posix_fill_gfid_fd (this, fd, &attrs);
+        if (rc != 0) {
+                gf_log_callingfn (this->name, GF_LOG_DEBUG, "failed to get gfid");
+        }
+
+        posix_fill_ino_from_gfid (this, &attrs);
+
+        if (stbuf_p != NULL)
+                *stbuf_p = attrs;
+
+        return rc;
+}
+
+/*
+ * lstat() an object through its handle path (built from @gfid and the
+ * optional @basename) and convert the result to a struct iatt in @buf_p.
+ * With @basename the gfid is read from the object's xattr, otherwise @gfid is
+ * copied. Returns 0 on success and -1 on failure; errno is set to ENOENT when
+ * the path turns out to be the handle directory itself.
+ */
+int
+posix_istat (xlator_t *this, uuid_t gfid, const char *basename,
+ struct iatt *buf_p)
+{
+ char *real_path = NULL;
+ struct stat lstatbuf = {0, };
+ struct iatt stbuf = {0, };
+ int ret = 0;
+ struct posix_private *priv = NULL;
+
+
+ priv = this->private;
+
+ MAKE_HANDLE_PATH (real_path, this, gfid, basename);
+
+ ret = lstat (real_path, &lstatbuf);
+
+ if (ret != 0) {
+ if (ret == -1) {
+ if (errno != ENOENT && errno != ELOOP) /* these two are not logged here */
+ gf_log (this->name, GF_LOG_WARNING,
+ "lstat failed on %s (%s)",
+ real_path, strerror (errno));
+ } else {
+ // may be some backend filesystem issue
+ gf_log (this->name, GF_LOG_ERROR, "lstat failed on "
+ "%s and return value is %d instead of -1. "
+ "Please see dmesg output to check whether the "
+ "failure is due to backend filesystem issue",
+ real_path, ret);
+ ret = -1;
+ }
+ goto out;
+ }
+
+ if ((lstatbuf.st_ino == priv->handledir.st_ino) &&
+ (lstatbuf.st_dev == priv->handledir.st_dev)) { /* the handle directory itself is hidden */
+ errno = ENOENT;
+ return -1;
+ }
+
+ if (!S_ISDIR (lstatbuf.st_mode))
+ lstatbuf.st_nlink --; /* presumably hides the extra hard link held by the handle -- confirm */
+
+ iatt_from_stat (&stbuf, &lstatbuf);
+
+ if (basename)
+ posix_fill_gfid_path (this, real_path, &stbuf);
+ else
+ uuid_copy (stbuf.ia_gfid, gfid);
+
+ posix_fill_ino_from_gfid (this, &stbuf);
+
+ if (buf_p)
+ *buf_p = stbuf;
+out:
+ return ret;
+}
+
+
+/*
+ * lstat() @path and convert the result to a struct iatt in @buf_p. A non-null
+ * @gfid is copied into the result; otherwise the gfid is read from the
+ * object's xattr. Returns 0 on success and -1 on failure; errno is set to
+ * ENOENT when @path turns out to be the handle directory itself.
+ */
+int
+posix_pstat (xlator_t *this, uuid_t gfid, const char *path,
+ struct iatt *buf_p)
+{
+ struct stat lstatbuf = {0, };
+ struct iatt stbuf = {0, };
+ int ret = 0;
+ struct posix_private *priv = NULL;
+
+
+ priv = this->private;
+
+ ret = lstat (path, &lstatbuf);
+
+ if (ret != 0) {
+ if (ret == -1) {
+ if (errno != ENOENT) /* a missing path is not logged here */
+ gf_log (this->name, GF_LOG_WARNING,
+ "lstat failed on %s (%s)",
+ path, strerror (errno));
+ } else {
+ // may be some backend filesystem issue
+ gf_log (this->name, GF_LOG_ERROR, "lstat failed on "
+ "%s and return value is %d instead of -1. "
+ "Please see dmesg output to check whether the "
+ "failure is due to backend filesystem issue",
+ path, ret);
+ ret = -1;
+ }
+ goto out;
+ }
+
+ if ((lstatbuf.st_ino == priv->handledir.st_ino) &&
+ (lstatbuf.st_dev == priv->handledir.st_dev)) { /* the handle directory itself is hidden */
+ errno = ENOENT;
+ return -1;
+ }
+
+ if (!S_ISDIR (lstatbuf.st_mode))
+ lstatbuf.st_nlink --; /* presumably hides the extra hard link held by the handle -- confirm */
+
+ iatt_from_stat (&stbuf, &lstatbuf);
+
+ if (gfid && !uuid_is_null (gfid))
+ uuid_copy (stbuf.ia_gfid, gfid);
+ else
+ posix_fill_gfid_path (this, path, &stbuf);
+
+ posix_fill_ino_from_gfid (this, &stbuf);
+
+ if (buf_p)
+ *buf_p = stbuf;
+out:
+ return ret;
+}
+
+
+/*
+ * Build the reply dictionary for a lookup: every key of @xattr_req is
+ * passed to _posix_xattr_get_set(), which adds the matching values for
+ * @real_path. Returns the new dictionary, or NULL when it cannot be
+ * allocated.
+ */
+dict_t *
+posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc,
+                         dict_t *xattr_req, struct iatt *buf)
+{
+        posix_xattr_filler_t  ctx = {0, };
+        dict_t               *reply = NULL;
+
+        reply = get_new_dict();
+        if (reply == NULL)
+                return reply;
+
+        ctx.this = this;
+        ctx.real_path = real_path;
+        ctx.xattr = reply;
+        ctx.stbuf = buf;
+        ctx.loc = loc;
+
+        dict_foreach (xattr_req, _posix_xattr_get_set, &ctx);
+
+        return reply;
+}
+
+/*
+ * Make sure @path carries a gfid xattr and a matching handle.
+ * If the xattr is already present it is kept; otherwise the 16-byte value
+ * stored under "gfid-req" in @xattr_req is set with XATTR_CREATE. Afterwards
+ * the handle is created or verified: a hard link for non-directories, a
+ * symlink for directories.
+ * Returns 0 when @xattr_req is NULL or lstat() on @path fails; otherwise the
+ * result of the failing step or of the handle function.
+ */
+int
+posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
+{
+ void *uuid_req = NULL;
+ uuid_t uuid_curr;
+ int ret = 0;
+ ssize_t size = 0;
+ struct stat stat = {0, };
+
+
+ if (!xattr_req)
+ goto out;
+
+ if (sys_lstat (path, &stat) != 0)
+ goto out;
+
+ size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16);
+ if (size == 16) { /* gfid already assigned: only the handle is checked */
+ ret = 0;
+ goto verify_handle;
+ }
+
+ ret = dict_get_ptr (xattr_req, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get the gfid from dict for %s",
+ loc->path);
+ goto out;
+ }
+
+ ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "setting GFID on %s failed (%s)", path,
+ strerror (errno));
+ goto out;
+ }
+ uuid_copy (uuid_curr, uuid_req);
+
+verify_handle:
+ if (!S_ISDIR (stat.st_mode))
+ ret = posix_handle_hard (this, path, uuid_curr, &stat);
+ else
+ ret = posix_handle_soft (this, path, loc, uuid_curr, &stat);
+
+out:
+ return ret;
+}
+
+/*
+ * Currently disabled: the function returns -1 before doing anything (see the
+ * XXX note), so everything after that return is unreachable.
+ * The unreachable code writes @value into the file "<path>/<key>", where key
+ * is @keyp without its first 15 characters.
+ * NOTE(review): if this is ever re-enabled, the write-failure paths leave
+ * file_fd open, and a successful replace falls through into the create path.
+ */
+int
+posix_set_file_contents (xlator_t *this, const char *path, char *keyp,
+ data_t *value, int flags)
+{
+ char * key = NULL;
+ char real_path[PATH_MAX];
+ int32_t file_fd = -1;
+ int op_ret = 0;
+ int ret = -1;
+
+
+ /* XXX: does not handle assigning GFID to created files */
+ return -1;
+
+ key = &(keyp[15]);
+ sprintf (real_path, "%s/%s", path, key);
+
+ if (flags & XATTR_REPLACE) {
+ /* if file exists, replace it
+ * else, error out */
+ file_fd = open (real_path, O_TRUNC|O_WRONLY);
+
+ if (file_fd == -1) {
+ goto create;
+ }
+
+ if (value->len) {
+ ret = write (file_fd, value->data, value->len);
+ if (ret == -1) {
+ op_ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "write failed while doing setxattr "
+ "for key %s on path %s: %s",
+ key, real_path, strerror (errno));
+ goto out;
+ }
+
+ ret = close (file_fd);
+ if (ret == -1) {
+ op_ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "close failed on %s: %s",
+ real_path, strerror (errno));
+ goto out;
+ }
+ }
+
+ create: /* we know file doesn't exist, create it */
+
+ file_fd = open (real_path, O_CREAT|O_WRONLY, 0644);
+
+ if (file_fd == -1) {
+ op_ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to open file %s with O_CREAT: %s",
+ key, strerror (errno));
+ goto out;
+ }
+
+ ret = write (file_fd, value->data, value->len);
+ if (ret == -1) {
+ op_ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "write failed on %s while setxattr with "
+ "key %s: %s",
+ real_path, key, strerror (errno));
+ goto out;
+ }
+
+ ret = close (file_fd);
+ if (ret == -1) {
+ op_ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "close failed on %s while setxattr with "
+ "key %s: %s",
+ real_path, key, strerror (errno));
+ goto out;
+ }
+ }
+
+out:
+ return op_ret;
+}
+
+
+/*
+ * Read the whole file @name under the directory identified by @pargfid into
+ * a freshly allocated, NUL-terminated buffer.
+ *
+ * On success returns the result of close() (0) and stores the buffer in
+ * *@contents; the caller owns it and releases it with GF_FREE.
+ * On failure returns a negative value and leaves *@contents untouched.
+ *
+ * A read that returns 0 bytes (e.g. an empty file) is reported as a failure,
+ * as before.
+ */
+int
+posix_get_file_contents (xlator_t *this, uuid_t pargfid,
+                         const char *name, char **contents)
+{
+        char        *real_path = NULL;
+        char        *buf       = NULL;
+        int32_t      file_fd   = -1;
+        struct iatt  stbuf     = {0,};
+        int          op_ret    = 0;
+        int          ret       = -1;
+
+
+        MAKE_HANDLE_PATH (real_path, this, pargfid, name);
+
+        op_ret = posix_istat (this, pargfid, name, &stbuf);
+        if (op_ret == -1) {
+                op_ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s",
+                        real_path, strerror (errno));
+                goto out;
+        }
+
+        file_fd = open (real_path, O_RDONLY);
+
+        if (file_fd == -1) {
+                op_ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s",
+                        real_path, strerror (errno));
+                goto out;
+        }
+
+        /* work on a local buffer so that the error path never frees a
+         * pointer the caller handed in through *contents */
+        buf = GF_CALLOC (stbuf.ia_size + 1, sizeof(char),
+                         gf_posix_mt_char);
+        if (!buf) {
+                /* errno may be 0 here; always report a real failure */
+                op_ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = read (file_fd, buf, stbuf.ia_size);
+        if (ret <= 0) {
+                op_ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s",
+                        real_path, strerror (errno));
+                goto out;
+        }
+
+        /* was `*contents[ia_size]`, which indexes the char ** first and
+         * then dereferences a wild pointer */
+        buf[stbuf.ia_size] = '\0';
+
+        op_ret = close (file_fd);
+        file_fd = -1;
+        if (op_ret == -1) {
+                op_ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s",
+                        real_path, strerror (errno));
+                goto out;
+        }
+
+        /* success: hand ownership of the buffer to the caller */
+        *contents = buf;
+        buf = NULL;
+
+out:
+        if (op_ret < 0) {
+                GF_FREE (buf);
+                if (file_fd != -1)
+                        close (file_fd);
+        }
+
+        return op_ret;
+}
+
+static int gf_xattr_enotsup_log; /* state passed to GF_LOG_OCCASIONALLY for the ENOTSUP warning of the setxattr helpers in this file; presumably throttles that message (confirm against the macro) */
+
+/*
+ * Apply one key/value pair to the entry at @real_path.
+ *
+ * Keys matching ZR_FILE_CONTENT_REQUEST are routed to
+ * posix_set_file_contents(); every other key is stored with
+ * sys_lsetxattr() using @flags.
+ *
+ * Returns 0 on success, the result of posix_set_file_contents() for
+ * file-content keys, or -errno of the failed lsetxattr.  The errno value is
+ * captured right after the syscall so that the logging calls cannot
+ * overwrite it before it is returned.
+ */
+int
+posix_handle_pair (xlator_t *this, const char *real_path,
+                   char *key, data_t *value, int flags)
+{
+        int sys_ret     = -1;
+        int ret         = 0;
+        int saved_errno = 0;
+
+        if (ZR_FILE_CONTENT_REQUEST(key)) {
+                ret = posix_set_file_contents (this, real_path, key, value,
+                                               flags);
+                goto out;
+        }
+
+        sys_ret = sys_lsetxattr (real_path, key, value->data,
+                                 value->len, flags);
+        if (sys_ret >= 0)
+                goto out;
+
+        saved_errno = errno;
+
+        if (saved_errno == ENOTSUP) {
+                GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
+                                    this->name,GF_LOG_WARNING,
+                                    "Extended attributes not "
+                                    "supported (try remounting "
+                                    "brick with 'user_xattr' "
+                                    "flag)");
+        } else if (saved_errno == ENOENT) {
+                /* ENOENT is not logged for the marker xattrs */
+                if (!posix_special_xattr (marker_xattrs,
+                                          key)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "setxattr on %s failed: %s",
+                                real_path, strerror (saved_errno));
+                }
+        } else {
+
+#ifdef GF_DARWIN_HOST_OS
+                gf_log (this->name,
+                        ((saved_errno == EINVAL) ?
+                         GF_LOG_DEBUG : GF_LOG_ERROR),
+                        "%s: key:%s error:%s",
+                        real_path, key,
+                        strerror (saved_errno));
+#else /* ! DARWIN */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: key:%s error:%s",
+                        real_path, key,
+                        strerror (saved_errno));
+#endif /* DARWIN */
+        }
+
+        ret = -saved_errno;
+out:
+        return ret;
+}
+
+/*
+ * Store one key/value pair as an extended attribute on the open descriptor
+ * @fd with sys_fsetxattr() using @flags.
+ *
+ * Returns 0 on success or -errno of the failed fsetxattr.  The errno value
+ * is captured right after the syscall so that the logging calls cannot
+ * overwrite it before it is returned.
+ */
+int
+posix_fhandle_pair (xlator_t *this, int fd,
+                    char *key, data_t *value, int flags)
+{
+        int sys_ret     = -1;
+        int ret         = 0;
+        int saved_errno = 0;
+
+        sys_ret = sys_fsetxattr (fd, key, value->data,
+                                 value->len, flags);
+        if (sys_ret >= 0)
+                goto out;
+
+        saved_errno = errno;
+
+        if (saved_errno == ENOTSUP) {
+                GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
+                                    this->name,GF_LOG_WARNING,
+                                    "Extended attributes not "
+                                    "supported (try remounting "
+                                    "brick with 'user_xattr' "
+                                    "flag)");
+        } else if (saved_errno == ENOENT) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fsetxattr on fd=%d failed: %s", fd,
+                        strerror (saved_errno));
+        } else {
+
+#ifdef GF_DARWIN_HOST_OS
+                gf_log (this->name,
+                        ((saved_errno == EINVAL) ?
+                         GF_LOG_DEBUG : GF_LOG_ERROR),
+                        "fd=%d: key:%s error:%s",
+                        fd, key, strerror (saved_errno));
+#else /* ! DARWIN */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fd=%d: key:%s error:%s",
+                        fd, key, strerror (saved_errno));
+#endif /* DARWIN */
+        }
+
+        ret = -saved_errno;
+
+out:
+        return ret;
+}
+
+
+/*
+ * nftw() callback used by the janitor thread: removes the visited entry.
+ *
+ * Non-directories are unlinked, and their handle is dropped when the stat
+ * taken beforehand reported a link count of 1.  Directories are removed
+ * together with their handle, except for the top-level directory of the
+ * walk (ftwbuf->level == 0).  Always returns 0 so the walk continues.
+ */
+static int
+janitor_walker (const char *fpath, const struct stat *sb,
+                int typeflag, struct FTW *ftwbuf)
+{
+        xlator_t    *this    = THIS;
+        struct iatt  pre_buf = {0, };
+
+        /* stat first: the gfid and link count are needed after removal */
+        posix_pstat (this, NULL, fpath, &pre_buf);
+
+        switch (sb->st_mode & S_IFMT) {
+        case S_IFDIR:
+                if (!ftwbuf->level)
+                        break;  /* don't remove top level dir */
+
+                gf_log (THIS->name, GF_LOG_TRACE,
+                        "removing directory %s", fpath);
+
+                rmdir (fpath);
+                posix_handle_unset (this, pre_buf.ia_gfid, NULL);
+                break;
+
+        case S_IFREG:
+        case S_IFBLK:
+        case S_IFLNK:
+        case S_IFCHR:
+        case S_IFIFO:
+        case S_IFSOCK:
+                gf_log (THIS->name, GF_LOG_TRACE,
+                        "unlinking %s", fpath);
+                unlink (fpath);
+                if (pre_buf.ia_nlink == 1)
+                        posix_handle_unset (this, pre_buf.ia_gfid, NULL);
+                break;
+
+        default:
+                break;
+        }
+
+        return 0;       /* 0 = FTW_CONTINUE */
+}
+
+
+/*
+ * Take the first posix_fd off priv->janitor_fds.
+ *
+ * When the list is empty the caller is parked on priv->janitor_cond for up
+ * to priv->janitor_sleep_duration seconds and NULL is returned, whether or
+ * not something was queued in the meantime.
+ */
+static struct posix_fd *
+janitor_get_next_fd (xlator_t *this)
+{
+        struct posix_private *priv     = this->private;
+        struct posix_fd      *next     = NULL;
+        struct timespec       deadline;
+
+        pthread_mutex_lock (&priv->janitor_lock);
+        {
+                if (!list_empty (&priv->janitor_fds)) {
+                        next = list_entry (priv->janitor_fds.next,
+                                           struct posix_fd, list);
+
+                        list_del (priv->janitor_fds.next);
+                } else {
+                        /* absolute deadline: now + sleep duration */
+                        time (&deadline.tv_sec);
+                        deadline.tv_sec += priv->janitor_sleep_duration;
+                        deadline.tv_nsec = 0;
+
+                        pthread_cond_timedwait (&priv->janitor_cond,
+                                                &priv->janitor_lock,
+                                                &deadline);
+                }
+        }
+        pthread_mutex_unlock (&priv->janitor_lock);
+
+        return next;
+}
+
+/* Body of the janitor thread; @data is the xlator_t.
+ *
+ * Loops forever.  Whenever more than priv->janitor_sleep_duration seconds
+ * have passed since priv->last_landfill_check it walks priv->trash_path with
+ * nftw(), applying janitor_walker() to every entry.  It then fetches one
+ * queued posix_fd via janitor_get_next_fd(), closes its descriptor or
+ * directory stream and frees it.  The trailing `return NULL` is not reached.
+ */
+static void *
+posix_janitor_thread_proc (void *data)
+{
+ xlator_t * this = NULL;
+ struct posix_private *priv = NULL;
+ struct posix_fd *pfd;
+
+ time_t now;
+
+ this = data;
+ priv = this->private;
+ /* janitor_walker() reads the xlator back through THIS */
+ THIS = this;
+
+ while (1) {
+ time (&now);
+ if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "janitor cleaning out %s", priv->trash_path);
+
+ nftw (priv->trash_path,
+ janitor_walker,
+ 32, /* nftw's limit on simultaneously open directories */
+ FTW_DEPTH | FTW_PHYS); /* visit contents before their directory; do not follow symlinks */
+
+ priv->last_landfill_check = now;
+ }
+ /* returns NULL (after waiting) when nothing is queued */
+ pfd = janitor_get_next_fd (this);
+ if (pfd) {
+ if (pfd->dir == NULL) { /* no directory stream: plain descriptor */
+ gf_log (this->name, GF_LOG_TRACE,
+ "janitor: closing file fd=%d", pfd->fd);
+ close (pfd->fd);
+ } else {
+ gf_log (this->name, GF_LOG_TRACE,
+ "janitor: closing dir fd=%p", pfd->dir);
+ closedir (pfd->dir);
+ }
+
+ GF_FREE (pfd);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Start the janitor thread once per xlator.
+ *
+ * Under priv->lock: does nothing if priv->janitor_present is already set;
+ * otherwise creates the thread running posix_janitor_thread_proc() and sets
+ * the flag only when the creation did not report a failure.
+ */
+void
+posix_spawn_janitor_thread (xlator_t *this)
+{
+        struct posix_private *priv = this->private;
+        int                   rc   = 0;
+
+        LOCK (&priv->lock);
+        {
+                if (priv->janitor_present)
+                        goto unlock;
+
+                rc = gf_thread_create (&priv->janitor, NULL,
+                                       posix_janitor_thread_proc, this);
+                if (rc < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "spawning janitor thread failed: %s",
+                                strerror (errno));
+                        goto unlock;
+                }
+
+                priv->janitor_present = _gf_true;
+        }
+unlock:
+        UNLOCK (&priv->lock);
+}
+
+/*
+ * Tell whether the inode described by @stat changed within the last second.
+ *
+ * Returns 1 when st_ctime lies in [now - 1, now] (seconds, taken from
+ * gettimeofday), 0 otherwise.
+ */
+static int
+is_fresh_file (struct stat *stat)
+{
+        struct timeval now;
+
+        gettimeofday (&now, NULL);
+
+        if (stat->st_ctime < (now.tv_sec - 1))
+                return 0;       /* older than one second */
+
+        if (stat->st_ctime > now.tv_sec)
+                return 0;       /* ctime lies in the future */
+
+        return 1;
+}
+
+
+int
+posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
+{
+        /*
+         * Guards against a lookup racing with an inode-creating FOP
+         * (mkdir/mknod/create etc.) in the following way:
+         *
+         *        {create thread}             |     {lookup thread}
+         *                                    |
+         *  t0    mkdir ("name")              |
+         *  t1                                | posix_gfid_set ("name", 2);
+         *  t2    posix_gfid_set ("name", 1); |
+         *  t3    lstat ("name");             | lstat ("name");
+         *
+         * Here the mkdir FOP would end up with GFID 2 although it should
+         * have been GFID 1.  That matters when the GFID has been set to 1
+         * on the other subvolumes of replicate/distribute.
+         *
+         * The "solution": when lookup is about to set a GFID on a file
+         * that was created very recently but has no GFID yet (i.e. we are
+         * between t1 and t2), pretend that posix_gfid_heal had been called
+         * at t0 instead and report ENOENT.
+         */
+
+        struct stat  stbuf  = {0, };
+        uuid_t       gfid;
+        int          ret    = 0;
+
+        if (!xattr_req)
+                goto out;
+
+        if (sys_lstat (path, &stbuf) != 0)
+                goto out;
+
+        ret = sys_lgetxattr (path, GFID_XATTR_KEY, gfid, 16);
+        if ((ret != 16) && is_fresh_file (&stbuf)) {
+                /* fresh entry without a GFID: leave it to the creator */
+                ret = -1;
+                errno = ENOENT;
+                goto out;
+        }
+
+        ret = posix_gfid_set (this, path, loc, xattr_req);
+out:
+        return ret;
+}
+
+
+/*
+ * Copy the POSIX ACL xattrs found in @xattr_req onto the entry at @path.
+ *
+ * The access ACL is applied first, then the default ACL; each only when its
+ * key is present in the dict.  Returns 0 when there is nothing to do
+ * (@xattr_req is NULL or lstat on @path fails), otherwise the result of the
+ * last sys_lsetxattr() attempted, stopping at the first failure.
+ */
+int
+posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
+{
+        struct stat  stbuf = {0, };
+        data_t      *acl   = NULL;
+        int          ret   = 0;
+
+        if (!xattr_req || sys_lstat (path, &stbuf) != 0)
+                return ret;
+
+        acl = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR);
+        if (acl) {
+                ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR,
+                                     acl->data, acl->len, 0);
+                if (ret != 0)
+                        return ret;
+        }
+
+        acl = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR);
+        if (acl)
+                ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR,
+                                     acl->data, acl->len, 0);
+
+        return ret;
+}
+
+/*
+ * dict_foreach() callback: applies one key/value pair to the path stored in
+ * the posix_xattr_filler_t passed through @tmp, using XATTR_CREATE.
+ *
+ * The GFID keys, the two POSIX ACL keys and file-content request keys are
+ * skipped.  Returns 0 on success or skip; on failure sets errno from the
+ * negative result of posix_handle_pair() and returns -1.
+ */
+static int
+_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v,
+                                    void *tmp)
+{
+        posix_xattr_filler_t *filler = tmp;
+        int                   rc     = -1;
+
+        if (strcmp (GFID_XATTR_KEY, k) == 0)
+                return 0;
+        if (strcmp ("gfid-req", k) == 0)
+                return 0;
+        if (strcmp (POSIX_ACL_DEFAULT_XATTR, k) == 0)
+                return 0;
+        if (strcmp (POSIX_ACL_ACCESS_XATTR, k) == 0)
+                return 0;
+        if (ZR_FILE_CONTENT_REQUEST(k))
+                return 0;
+
+        rc = posix_handle_pair (filler->this, filler->real_path, k, v,
+                                XATTR_CREATE);
+        if (rc >= 0)
+                return 0;
+
+        errno = -rc;
+        return -1;
+}
+
+/*
+ * Apply every pair of @dict to the newly created entry at @path by running
+ * _handle_entry_create_keyvalue_pair() over the dict.
+ *
+ * Returns -1 when @dict is NULL, otherwise the result of dict_foreach().
+ */
+int
+posix_entry_create_xattr_set (xlator_t *this, const char *path,
+                              dict_t *dict)
+{
+        posix_xattr_filler_t filler = {0,};
+
+        if (!dict)
+                return -1;
+
+        filler.this      = this;
+        filler.real_path = path;
+
+        return dict_foreach (dict, _handle_entry_create_keyvalue_pair,
+                             &filler);
+}
+
+
+/*
+ * Fetch the posix_fd stored in the context of @fd, creating it on demand
+ * for anonymous fds.  The double-underscore prefix suggests the caller is
+ * expected to hold the lock (posix_fd_ctx_get() takes fd->inode->lock).
+ *
+ * For an anonymous fd without a context the handle path of the inode is
+ * opened (opendir() for directories, open() with O_RDWR for regular files)
+ * and the resulting posix_fd is stored in the fd context.
+ *
+ * *@pfd_p (when @pfd_p is non-NULL) receives the posix_fd, or NULL when none
+ * could be obtained.  Returns 0 on success and a non-zero value otherwise.
+ */
+static int
+__posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p)
+{
+        uint64_t          tmp_pfd   = 0;
+        struct posix_fd  *pfd       = NULL;
+        int               ret       = -1;
+        char             *real_path = NULL;
+        int               _fd       = -1;
+        DIR              *dir       = NULL;
+
+        ret = __fd_ctx_get (fd, this, &tmp_pfd);
+        if (ret == 0) {
+                pfd = (void *)(long) tmp_pfd;
+                goto out;
+        }
+
+        /* only anonymous fds get a context created on demand */
+        if (!fd_is_anonymous(fd))
+                goto out;
+
+        MAKE_HANDLE_PATH (real_path, this, fd->inode->gfid, NULL);
+
+        pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd);
+        if (!pfd) {
+                goto out;
+        }
+        pfd->fd = -1;
+
+        if (fd->inode->ia_type == IA_IFDIR) {
+                dir = opendir (real_path);
+                if (!dir) {
+                        GF_FREE (pfd);
+                        pfd = NULL;
+                        goto out;
+                }
+                _fd = dirfd (dir);
+        }
+
+        if (fd->inode->ia_type == IA_IFREG) {
+                _fd = open (real_path, O_RDWR|O_LARGEFILE);
+                if (_fd == -1) {
+                        GF_FREE (pfd);
+                        pfd = NULL;
+                        goto out;
+                }
+        }
+
+        pfd->fd = _fd;
+        pfd->dir = dir;
+
+        ret = __fd_ctx_set (fd, this, (uint64_t) (long) pfd);
+        if (ret != 0) {
+                /* A descriptor obtained through dirfd() belongs to the
+                 * directory stream and is released by closedir(); closing
+                 * it separately as well would close it twice. */
+                if (dir)
+                        closedir (dir);
+                else if (_fd != -1)
+                        close (_fd);
+                GF_FREE (pfd);
+                pfd = NULL;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (pfd_p)
+                *pfd_p = pfd;
+        return ret;
+}
+
+
+/*
+ * Locked wrapper around __posix_fd_ctx_get(): performs the lookup while
+ * holding fd->inode->lock and returns its result unchanged.
+ */
+int
+posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd)
+{
+        int rc;
+
+        LOCK (&fd->inode->lock);
+        rc = __posix_fd_ctx_get (fd, this, pfd);
+        UNLOCK (&fd->inode->lock);
+
+        return rc;
+}
+/* Body of the health-check thread; @data is the xlator_t.
+ *
+ * The interval is sampled once from priv->health_check_interval.  Every
+ * `interval` seconds the thread stat()s priv->base_path.  When that stat
+ * fails it logs, notifies the parent xlator with GF_EVENT_CHILD_DOWN and
+ * then sends SIGTERM, and later SIGKILL, to its own process, each after an
+ * uninterrupted 30 second sleep.
+ *
+ * When the interval is 0 or sleep() returns early, the thread clears
+ * priv->health_check_active under priv->lock and exits.
+ */
+static void *
+posix_health_check_thread_proc (void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ uint32_t interval = 0;
+ int ret = -1;
+ struct stat sb = {0, };
+
+ this = data;
+ priv = this->private;
+
+ /* prevent races when the interval is updated */
+ interval = priv->health_check_interval;
+ if (interval == 0)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, "
+ "interval = %d seconds", interval); /* NOTE(review): interval is uint32_t, %u would match its type */
+
+ while (1) {
+ /* aborting sleep() is a request to exit this thread, sleep()
+ * will normally not return when cancelled */
+ ret = sleep (interval);
+ if (ret > 0)
+ break;
+
+ /* prevent thread errors while doing the health-check(s) */
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+ /* Do the health-check, it should be moved to its own function
+ * in case it gets more complex. */
+ ret = stat (priv->base_path, &sb);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "stat() on %s returned: %s", priv->base_path,
+ strerror (errno));
+ goto abort; /* leaves cancellation disabled */
+ }
+
+ pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting");
+
+ LOCK (&priv->lock);
+ {
+ priv->health_check_active = _gf_false;
+ }
+ UNLOCK (&priv->lock);
+
+ return NULL;
+ /* the abort path is only entered from the loop, with cancellation still disabled */
+abort:
+ /* health-check failed */
+ gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down");
+ xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this);
+
+ ret = sleep (30);
+ if (ret == 0) { /* slept the full 30 seconds */
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM");
+ kill (getpid(), SIGTERM);
+ }
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL");
+ kill (getpid(), SIGKILL);
+ }
+
+ return NULL;
+}
+
+/*
+ * (Re)start the health-check thread of @xl.
+ *
+ * Under priv->lock: a checker that is marked active is cancelled first.
+ * When priv->health_check_interval is 0 no new thread is started.
+ * Otherwise a thread running posix_health_check_thread_proc() is created
+ * and detached; if the creation reports a failure the interval is reset to
+ * 0 and the checker is left inactive.
+ */
+void
+posix_spawn_health_check_thread (xlator_t *xl)
+{
+        struct posix_private *priv = xl->private;
+        int                   rc   = -1;
+
+        LOCK (&priv->lock);
+        {
+                /* cancel the running thread */
+                if (priv->health_check_active == _gf_true) {
+                        pthread_cancel (priv->health_check);
+                        priv->health_check_active = _gf_false;
+                }
+
+                /* prevent scheduling a check in a tight loop */
+                if (priv->health_check_interval != 0) {
+                        rc = gf_thread_create (&priv->health_check, NULL,
+                                               posix_health_check_thread_proc,
+                                               xl);
+                        if (rc < 0) {
+                                priv->health_check_interval = 0;
+                                priv->health_check_active = _gf_false;
+                                gf_log (xl->name, GF_LOG_ERROR,
+                                        "unable to setup health-check thread: %s",
+                                        strerror (errno));
+                        } else {
+                                /* run the thread detached, resources will
+                                 * be freed on exit */
+                                pthread_detach (priv->health_check);
+                                priv->health_check_active = _gf_true;
+                        }
+                }
+        }
+        UNLOCK (&priv->lock);
+}
+
+/*
+ * Move every queued fsync request from priv->fsyncs onto @head.
+ *
+ * Blocks on priv->fsync_cond until the queue is non-empty.  Returns the
+ * value of priv->fsync_queue_count at the time of the pick and resets that
+ * counter to 0.
+ */
+int
+posix_fsyncer_pick (xlator_t *this, struct list_head *head)
+{
+        struct posix_private *priv   = this->private;
+        int                   picked = 0;
+
+        pthread_mutex_lock (&priv->fsync_mutex);
+        {
+                for (;;) {
+                        if (!list_empty (&priv->fsyncs))
+                                break;
+                        pthread_cond_wait (&priv->fsync_cond,
+                                           &priv->fsync_mutex);
+                }
+
+                picked = priv->fsync_queue_count;
+                priv->fsync_queue_count = 0;
+                list_splice_init (&priv->fsyncs, head);
+        }
+        pthread_mutex_unlock (&priv->fsync_mutex);
+
+        return picked;
+}
+
+
+/*
+ * Complete one queued fsync request.
+ *
+ * Looks up the posix_fd of stub->args.fd and, when @do_fsync is set, syncs
+ * it: fdatasync() if the request asked for datasync and HAVE_FDATASYNC is
+ * defined, fsync() otherwise.  The stub is always unwound through
+ * call_unwind_error(): with (-1, EINVAL) when the fd context is missing,
+ * with (-1, errno of the sync call) when syncing fails, and with (0, 0)
+ * otherwise.
+ */
+void
+posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync)
+{
+        struct posix_fd *pfd      = NULL;
+        int              ret      = -1;
+        int              op_errno = 0;
+
+        ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not get fdctx for fd(%s)",
+                        uuid_utoa (stub->args.fd->inode->gfid));
+                call_unwind_error (stub, -1, EINVAL);
+                return;
+        }
+
+        if (do_fsync) {
+#ifdef HAVE_FDATASYNC
+                if (stub->args.datasync)
+                        ret = fdatasync (pfd->fd);
+                else
+#endif
+                        ret = fsync (pfd->fd);
+        } else {
+                ret = 0;
+        }
+
+        if (ret) {
+                /* capture errno before logging can overwrite it */
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not fsync fd(%s)",
+                        uuid_utoa (stub->args.fd->inode->gfid));
+                call_unwind_error (stub, -1, op_errno);
+                return;
+        }
+
+        call_unwind_error (stub, 0, 0);
+}
+
+/* Flush the filesystem that backs the batch of fsync stubs in @head.
+ *
+ * The posix_fd of the last stub in the list (head->prev) is looked up; if
+ * that fails the function returns without syncing.  On Linux builds where
+ * SYS_syncfs is defined the syncfs syscall is issued on that descriptor; in
+ * every other configuration sync() is called instead.
+ *
+ * Assumes @head is not empty; posix_fsyncer_pick() only returns once the
+ * queue has entries.
+ */
+static void
+posix_fsyncer_syncfs (xlator_t *this, struct list_head *head)
+{
+ call_stub_t *stub = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+
+ stub = list_entry (head->prev, call_stub_t, list);
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret)
+ return;
+ /* NOTE(review): the #include lines below sit inside the function body; consider moving them to the top of the file */
+#ifdef GF_LINUX_HOST_OS
+ /* syncfs() is not "declared" in RHEL's glibc even though
+ the kernel has support.
+ */
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifdef SYS_syncfs
+ syscall (SYS_syncfs, pfd->fd);
+#else
+ sync();
+#endif
+#else
+ sync();
+#endif
+}
+
+/* Batch-fsync worker loop; @d is the xlator_t.  Never returns.
+ *
+ * Each iteration picks all queued fsync stubs (blocking until there is at
+ * least one), sleeps priv->batch_fsync_delay_usec, issues a filesystem-wide
+ * sync for the BATCH_SYNCFS* modes and then completes the stubs from the
+ * tail of the list to its head.
+ */
+void *
+posix_fsyncer (void *d)
+{
+ xlator_t *this = d;
+ struct posix_private *priv = NULL;
+ call_stub_t *stub = NULL;
+ call_stub_t *tmp = NULL;
+ struct list_head list;
+ int count = 0;
+ gf_boolean_t do_fsync = _gf_true;
+
+ priv = this->private;
+
+ for (;;) {
+ INIT_LIST_HEAD (&list);
+
+ count = posix_fsyncer_pick (this, &list);
+ /* wait before syncing the picked batch */
+ usleep (priv->batch_fsync_delay_usec);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "picked %d fsyncs", count);
+
+ switch (priv->batch_fsync_mode) {
+ case BATCH_NONE:
+ case BATCH_REVERSE_FSYNC:
+ break;
+ case BATCH_SYNCFS:
+ case BATCH_SYNCFS_SINGLE_FSYNC:
+ case BATCH_SYNCFS_REVERSE_FSYNC:
+ posix_fsyncer_syncfs (this, &list);
+ break;
+ }
+ /* in plain BATCH_SYNCFS mode the stubs are unwound without a per-fd fsync */
+ if (priv->batch_fsync_mode == BATCH_SYNCFS)
+ do_fsync = _gf_false;
+ else
+ do_fsync = _gf_true;
+
+ list_for_each_entry_safe_reverse (stub, tmp, &list, list) {
+ list_del_init (&stub->list);
+
+ posix_fsyncer_process (this, stub, do_fsync);
+ /* SINGLE_FSYNC: only the first stub processed gets a real fsync */
+ if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC)
+ do_fsync = _gf_false;
+ }
+ }
+}
diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h
new file mode 100644
index 000000000..81752c17e
--- /dev/null
+++ b/xlators/storage/posix/src/posix-mem-types.h
@@ -0,0 +1,27 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __POSIX_MEM_TYPES_H__
+#define __POSIX_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_posix_mem_types_ { /* allocation type tags of the posix xlator, e.g. the last argument of GF_CALLOC in posix-helpers.c */
+ gf_posix_mt_dir_entry_t = gf_common_mt_end + 1, /* numbering continues right after the common types */
+ gf_posix_mt_posix_fd,
+ gf_posix_mt_char,
+ gf_posix_mt_posix_private,
+ gf_posix_mt_int32_t,
+ gf_posix_mt_posix_dev_t,
+ gf_posix_mt_trash_path,
+ gf_posix_mt_paiocb,
+ gf_posix_mt_end /* presumably an end marker; keep it last (confirm against mem-types.h usage) */
+};
+#endif
+
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index b730b136f..fb45c7a67 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
@@ -24,17 +14,28 @@
#define __XOPEN_SOURCE 500
+#include <openssl/md5.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <errno.h>
+#include <libgen.h>
+#include <pthread.h>
#include <ftw.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <sys/uio.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
#endif /* GF_BSD_HOST_OS */
+#ifdef HAVE_LINKAT
+#include <fcntl.h>
+#endif /* HAVE_LINKAT */
+
#include "glusterfs.h"
+#include "checksum.h"
#include "dict.h"
#include "logging.h"
#include "posix.h"
@@ -45,18 +46,28 @@
#include "compat.h"
#include "byte-order.h"
#include "syscall.h"
+#include "statedump.h"
+#include "locking.h"
+#include "timer.h"
+#include "glusterfs3-xdr.h"
+#include "hashfn.h"
+#include "posix-aio.h"
+#include "glusterfs-acl.h"
+
+extern char *marker_xattrs[];
+#define ALIGN_SIZE 4096
#undef HAVE_SET_FSID
#ifdef HAVE_SET_FSID
#define DECLARE_OLD_FS_ID_VAR uid_t old_fsuid; gid_t old_fsgid;
-#define SET_FS_ID(uid, gid) do { \
+#define SET_FS_ID(uid, gid) do { \
old_fsuid = setfsuid (uid); \
old_fsgid = setfsgid (gid); \
} while (0)
-#define SET_TO_OLD_FS_ID() do { \
+#define SET_TO_OLD_FS_ID() do { \
setfsuid (old_fsuid); \
setfsgid (old_fsgid); \
} while (0)
@@ -69,243 +80,106 @@
#endif
-typedef struct {
- xlator_t *this;
- const char *real_path;
- dict_t *xattr;
- struct stat *stbuf;
- loc_t *loc;
-} posix_xattr_filler_t;
-
int
posix_forget (xlator_t *this, inode_t *inode)
{
- uint64_t tmp_cache = 0;
- if (!inode_ctx_del (inode, this, &tmp_cache))
- dict_destroy ((dict_t *)(long)tmp_cache);
-
- return 0;
-}
-
-static void
-_posix_xattr_get_set (dict_t *xattr_req,
- char *key,
- data_t *data,
- void *xattrargs)
-{
- posix_xattr_filler_t *filler = xattrargs;
- char *value = NULL;
- ssize_t xattr_size = -1;
- int ret = -1;
- char *databuf = NULL;
- int _fd = -1;
- loc_t *loc = NULL;
- ssize_t req_size = 0;
-
-
- /* should size be put into the data_t ? */
- if (!strcmp (key, "glusterfs.content")) {
- /* file content request */
- req_size = data_to_uint64 (data);
- if (req_size >= filler->stbuf->st_size) {
- _fd = open (filler->real_path, O_RDONLY);
-
- if (_fd == -1) {
- gf_log (filler->this->name, GF_LOG_ERROR,
- "Opening file %s failed: %s",
- filler->real_path, strerror (errno));
- goto err;
- }
-
- databuf = calloc (1, filler->stbuf->st_size);
-
- if (!databuf) {
- gf_log (filler->this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto err;
- }
-
- ret = read (_fd, databuf, filler->stbuf->st_size);
- if (ret == -1) {
- gf_log (filler->this->name, GF_LOG_ERROR,
- "Read on file %s failed: %s",
- filler->real_path, strerror (errno));
- goto err;
- }
-
- ret = close (_fd);
- _fd = -1;
- if (ret == -1) {
- gf_log (filler->this->name, GF_LOG_ERROR,
- "Close on file %s failed: %s",
- filler->real_path, strerror (errno));
- goto err;
- }
-
- ret = dict_set_bin (filler->xattr, key,
- databuf, filler->stbuf->st_size);
- if (ret < 0) {
- goto err;
- }
-
- /* To avoid double free in cleanup below */
- databuf = NULL;
- err:
- if (_fd != -1)
- close (_fd);
- if (databuf)
- FREE (databuf);
- }
- } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) {
- loc = filler->loc;
- if (!list_empty (&loc->inode->fd_list)) {
- ret = dict_set_uint32 (filler->xattr, key, 1);
- } else {
- ret = dict_set_uint32 (filler->xattr, key, 0);
- }
- } else {
- xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0);
-
- if (xattr_size > 0) {
- value = calloc (1, xattr_size + 1);
-
- sys_lgetxattr (filler->real_path, key, value,
- xattr_size);
-
- value[xattr_size] = '\0';
- ret = dict_set_bin (filler->xattr, key,
- value, xattr_size);
- if (ret < 0)
- gf_log (filler->this->name, GF_LOG_DEBUG,
- "dict set failed. path: %s, key: %s",
- filler->real_path, key);
- }
- }
-}
-
-
-dict_t *
-posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc,
- dict_t *xattr_req, struct stat *buf)
-{
- dict_t *xattr = NULL;
- posix_xattr_filler_t filler = {0, };
-
- xattr = get_new_dict();
- if (!xattr) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- filler.this = this;
- filler.real_path = real_path;
- filler.xattr = xattr;
- filler.stbuf = buf;
- filler.loc = loc;
-
- dict_foreach (xattr_req, _posix_xattr_get_set, &filler);
-out:
- return xattr;
-}
-
-
-static int
-posix_scale_st_ino (struct posix_private *priv, struct stat *buf)
-{
- int i = 0;
- int ret = -1;
- ino_t temp_ino = 0;
-
- for (i = 0; i < priv->num_devices_to_span; i++) {
- if (buf->st_dev == priv->st_device[i])
- break;
- if (priv->st_device[i] == 0) {
- priv->st_device[i] = buf->st_dev;
- break;
- }
- }
-
- if (i == priv->num_devices_to_span)
- goto out;
-
- temp_ino = (buf->st_ino * priv->num_devices_to_span) + i;
-
- buf->st_ino = temp_ino;
+ uint64_t tmp_cache = 0;
+ if (!inode_ctx_del (inode, this, &tmp_cache))
+ dict_destroy ((dict_t *)(long)tmp_cache);
- ret = 0;
- out:
- return ret;
+ return 0;
}
+/* Regular fops */
int32_t
posix_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+ loc_t *loc, dict_t *xdata)
{
- struct stat buf = {0, };
- char * real_path = NULL;
+ struct iatt buf = {0, };
int32_t op_ret = -1;
+ int32_t entry_ret = 0;
int32_t op_errno = 0;
dict_t * xattr = NULL;
-
- struct posix_private *priv = NULL;
+ char * real_path = NULL;
+ char * par_path = NULL;
+ struct iatt postparent = {0,};
+ int32_t gfidless = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ /* The Hidden directory should be for housekeeping purpose and it
+ should not get any gfid on it */
+ if (__is_root_gfid (loc->pargfid) &&
+ (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Lookup issued on %s, which is not permitted",
+ GF_HIDDEN_PATH);
+ op_errno = EPERM;
+ op_ret = -1;
+ goto out;
+ }
- priv = this->private;
+ op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless);
+ op_ret = -1;
+ if (uuid_is_null (loc->pargfid)) {
+ /* nameless lookup */
+ MAKE_INODE_HANDLE (real_path, this, loc, &buf);
+ } else {
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf);
+
+ if (uuid_is_null (loc->inode->gfid)) {
+ posix_gfid_heal (this, real_path, loc, xdata);
+ MAKE_ENTRY_HANDLE (real_path, par_path, this,
+ loc, &buf);
+ }
+ }
- op_ret = lstat (real_path, &buf);
op_errno = errno;
if (op_ret == -1) {
- if (op_errno != ENOENT) {
- gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- loc->path, strerror (op_errno));
- }
- goto out;
- }
-
- /* Make sure we don't access another mountpoint inside export dir.
- * It may cause inode number to repeat from single export point,
- * which leads to severe problems..
- */
- if (!priv->span_devices) {
- if (priv->st_device[0] != buf.st_dev) {
- op_errno = ENOENT;
+ if (op_errno != ENOENT) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: different mountpoint/device, returning "
- "ENOENT", loc->path);
- goto out;
+ "lstat on %s failed: %s",
+ real_path, strerror (op_errno));
}
- } else {
- op_ret = posix_scale_st_ino (priv, &buf);
- if (-1 == op_ret) {
- op_errno = ENOENT;
+
+ entry_ret = -1;
+ goto parent;
+ }
+
+ if (xdata && (op_ret == 0)) {
+ xattr = posix_lookup_xattr_fill (this, real_path, loc,
+ xdata, &buf);
+ }
+
+parent:
+ if (par_path) {
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "%s: from different mountpoint",
- loc->path);
+ "post-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
goto out;
}
}
- if (xattr_req && (op_ret == 0)) {
- xattr = posix_lookup_xattr_fill (this, real_path, loc,
- xattr_req, &buf);
- }
-
- op_ret = 0;
+ op_ret = entry_ret;
out:
if (xattr)
dict_ref (xattr);
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &buf, xattr);
+ if (!op_ret && !gfidless && uuid_is_null (buf.ia_gfid)) {
+ gf_log (this->name, GF_LOG_ERROR, "buf->ia_gfid is null for "
+ "%s", (real_path) ? real_path: "");
+ op_ret = -1;
+ op_errno = ENODATA;
+ }
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno,
+ (loc)?loc->inode:NULL, &buf, xattr, &postparent);
if (xattr)
dict_unref (xattr);
@@ -315,14 +189,13 @@ out:
int32_t
-posix_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
+posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- struct stat buf = {0,};
- char * real_path = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ struct iatt buf = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ struct posix_private *priv = NULL;
+ char *real_path = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -330,402 +203,829 @@ posix_stat (call_frame_t *frame,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = lstat (real_path, &buf);
+ MAKE_INODE_HANDLE (real_path, this, loc, &buf);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s", loc->path,
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
+ "lstat on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID();
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
+ STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL);
return 0;
}
-int32_t
-posix_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+/*
+ * Apply the permission bits described by @stbuf to @path without
+ * following a trailing symlink.
+ *
+ * lchmod() is tried first.  When it fails with ENOSYS the request is
+ * treated as a successful no-op for symlinks and is retried through
+ * chmod() for every other file type.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the call that
+ * failed (sys_lstat, lchmod or chmod).
+ */
+static int
+posix_do_chmod (xlator_t *this, const char *path, struct iatt *stbuf)
+{
+        struct stat lstbuf;
+        mode_t      new_mode = 0;
+        int32_t     rc       = -1;
+
+        rc = sys_lstat (path, &lstbuf);
+        if (rc != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lstat failed: %s (%s)", path, strerror (errno));
+                return rc;
+        }
+
+        new_mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type);
+
+        rc = lchmod (path, new_mode);
+        if ((rc != -1) || (errno != ENOSYS))
+                return rc;
+
+        /* in Linux symlinks are always in mode 0777 and no
+           such call as lchmod exists.
+        */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "%s (%s)", path, strerror (errno));
+
+        /* nothing to change on a symlink: report success */
+        if (S_ISLNK (lstbuf.st_mode))
+                return 0;
+
+        return chmod (path, new_mode);
+}
+
+/*
+ * Change the owner and/or group of @path without following a trailing
+ * symlink.
+ *
+ * Only the ids selected in @valid (GF_SET_ATTR_UID / GF_SET_ATTR_GID)
+ * are taken from @stbuf; the other one is passed to lchown() as -1.
+ *
+ * Returns the result of lchown().
+ */
+static int
+posix_do_chown (xlator_t *this,
+                const char *path,
+                struct iatt *stbuf,
+                int32_t valid)
+{
+        uid_t owner = (valid & GF_SET_ATTR_UID) ? stbuf->ia_uid : (uid_t) -1;
+        gid_t group = (valid & GF_SET_ATTR_GID) ? stbuf->ia_gid : (gid_t) -1;
+
+        return lchown (path, owner, group);
+}
+
+/*
+ * Set the access and modification times of @path from @stbuf without
+ * following a trailing symlink.
+ *
+ * lutimes() is tried first.  When it fails with ENOSYS the request is
+ * treated as a successful no-op for symlinks and is retried through
+ * utimes() for every other file type.  Nanosecond fields of @stbuf are
+ * reduced to microseconds.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the call that
+ * failed (sys_lstat, lutimes or utimes).
+ */
+static int
+posix_do_utimes (xlator_t *this,
+                 const char *path,
+                 struct iatt *stbuf)
+{
+        struct stat    lstbuf;
+        struct timeval times[2] = {{0,},{0,}};
+        int32_t        rc       = -1;
+
+        rc = sys_lstat (path, &lstbuf);
+        if (rc != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s (%s)", path, strerror (errno));
+                return rc;
+        }
+
+        /* slot 0 carries the access time, slot 1 the modification time */
+        times[0].tv_sec  = stbuf->ia_atime;
+        times[0].tv_usec = stbuf->ia_atime_nsec / 1000;
+        times[1].tv_sec  = stbuf->ia_mtime;
+        times[1].tv_usec = stbuf->ia_mtime_nsec / 1000;
+
+        rc = lutimes (path, times);
+        if ((rc != -1) || (errno != ENOSYS))
+                return rc;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "%s (%s)", path, strerror (errno));
+
+        /* no lutimes() available: leave symlinks alone, report success */
+        if (S_ISLNK (lstbuf.st_mode))
+                return 0;
+
+        return utimes (path, times);
+}
+
+int
+posix_setattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char * real_path = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
DECLARE_OLD_FS_ID_VAR;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
- VALIDATE_OR_GOTO (fd, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, &statpre);
- dir = opendir (real_path);
-
- if (dir == NULL) {
+ if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "opendir failed on %s: %s",
- loc->path, strerror (op_errno));
+ "setattr (lstat) on %s failed: %s", real_path,
+ strerror (op_errno));
goto out;
}
- op_ret = dirfd (dir);
- if (op_ret < 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "dirfd() failed on %s: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
+ if (valid & GF_SET_ATTR_MODE) {
+ op_ret = posix_do_chmod (this, real_path, stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "setattr (chmod) on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
+ }
+ }
- pfd = CALLOC (1, sizeof (*fd));
- if (!pfd) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
+ if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){
+ op_ret = posix_do_chown (this, real_path, stbuf, valid);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "setattr (chown) on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
+ }
}
- pfd->dir = dir;
- pfd->fd = dirfd (dir);
- pfd->path = strdup (real_path);
- if (!pfd->path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
+ if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+ op_ret = posix_do_utimes (this, real_path, stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "setattr (utimes) on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
+ }
}
- fd_ctx_set (fd, this, (uint64_t)(long)pfd);
+ if (!valid) {
+ op_ret = lchown (real_path, -1, -1);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "lchown (%s, -1, -1) failed => (%s)",
+ real_path, strerror (op_errno));
- op_ret = 0;
+ goto out;
+ }
+ }
- out:
+ op_ret = posix_pstat (this, loc->gfid, real_path, &statpost);
if (op_ret == -1) {
- if (dir) {
- closedir (dir);
- dir = NULL;
- }
- if (pfd) {
- if (pfd->path)
- FREE (pfd->path);
- FREE (pfd);
- pfd = NULL;
- }
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "setattr (lstat) on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
}
+ op_ret = 0;
+
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+ STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno,
+ &statpre, &statpost, NULL);
+
return 0;
}
+/*
+ * Change the owner and/or group of the file open on @fd.
+ *
+ * Only the ids selected in @valid (GF_SET_ATTR_UID / GF_SET_ATTR_GID)
+ * are taken from @stbuf; the other one is passed to fchown() as -1.
+ *
+ * Returns the result of fchown().
+ */
+int32_t
+posix_do_fchown (xlator_t *this,
+                 int fd,
+                 struct iatt *stbuf,
+                 int32_t valid)
+{
+        uid_t owner = (valid & GF_SET_ATTR_UID) ? stbuf->ia_uid : (uid_t) -1;
+        gid_t group = (valid & GF_SET_ATTR_GID) ? stbuf->ia_gid : (gid_t) -1;
+
+        return fchown (fd, owner, group);
+}
+
int32_t
-posix_getdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, int32_t flag)
+posix_do_fchmod (xlator_t *this,
+ int fd, struct iatt *stbuf)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = NULL;
- dir_entry_t entries = {0, };
- dir_entry_t * tmp = NULL;
- DIR * dir = NULL;
- struct dirent * dirent = NULL;
- int real_path_len = -1;
- int entry_path_len = -1;
- char * entry_path = NULL;
- int count = 0;
- struct posix_fd * pfd = NULL;
- uint64_t tmp_pfd = 0;
- struct stat buf = {0,};
- int ret = -1;
- char tmp_real_path[ZR_PATH_MAX];
- char linkpath[ZR_PATH_MAX];
+ mode_t mode = 0;
- DECLARE_OLD_FS_ID_VAR ;
+ mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type);
+ return fchmod (fd, mode);
+}
+
+/*
+ * Placeholder for setting timestamps through an open descriptor.
+ *
+ * Nothing is changed on @fd: the call logs a warning, sets errno to
+ * ENOSYS and returns -1.  @stbuf is not read.
+ */
+static int
+posix_do_futimes (xlator_t *this, int fd, struct iatt *stbuf)
+{
+        const int unsupported = -1;
+
+        gf_log (this->name, GF_LOG_WARNING, "function not implemented fd(%d)", fd);
+
+        /* set after logging, so the caller reads ENOSYS from errno */
+        errno = ENOSYS;
+
+        return unsupported;
+}
+
+int
+posix_fsetattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ SET_FS_ID (frame->root->uid, frame->root->gid);
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
-
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
gf_log (this->name, GF_LOG_DEBUG,
- "fd %p does not have context in %s",
- fd, this->name);
+ "pfd is NULL from fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
- if (!pfd->path) {
- op_errno = EBADFD;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd does not have path set (possibly file "
- "fd, fd=%p)", fd);
+
+ op_ret = posix_fdstat (this, pfd->fd, &statpre);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsetattr (fstat) failed on fd=%p: %s", fd,
+ strerror (op_errno));
goto out;
}
- real_path = pfd->path;
- real_path_len = strlen (real_path);
+ if (valid & GF_SET_ATTR_MODE) {
+ op_ret = posix_do_fchmod (this, pfd->fd, stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsetattr (fchmod) failed on fd=%p: %s",
+ fd, strerror (op_errno));
+ goto out;
+ }
+ }
+
+ if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+ op_ret = posix_do_fchown (this, pfd->fd, stbuf, valid);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsetattr (fchown) failed on fd=%p: %s",
+ fd, strerror (op_errno));
+ goto out;
+ }
+
+ }
+
+ if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+ op_ret = posix_do_futimes (this, pfd->fd, stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsetattr (futimes) on failed fd=%p: %s", fd,
+ strerror (op_errno));
+ goto out;
+ }
+ }
+
+ if (!valid) {
+ op_ret = fchown (pfd->fd, -1, -1);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fchown (%d, -1, -1) failed => (%s)",
+ pfd->fd, strerror (op_errno));
- entry_path_len = real_path_len + NAME_MAX;
- entry_path = CALLOC (1, entry_path_len);
+ goto out;
+ }
+ }
- if (!entry_path) {
+ op_ret = posix_fdstat (this, pfd->fd, &statpost);
+ if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
+ "fsetattr (fstat) failed on fd=%p: %s", fd,
+ strerror (op_errno));
goto out;
}
- strncpy (entry_path, real_path, entry_path_len);
- entry_path[real_path_len] = '/';
+ op_ret = 0;
- dir = pfd->dir;
+out:
+ SET_TO_OLD_FS_ID ();
- if (!dir) {
- op_errno = EBADFD;
+ STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno,
+ &statpre, &statpost, NULL);
+
+ return 0;
+}
+
+static int32_t
+posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ off_t offset, size_t len, struct iatt *statpre,
+ struct iatt *statpost)
+{
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
gf_log (this->name, GF_LOG_DEBUG,
- "pfd does not have dir set (possibly file fd, "
- "fd=%p, path=`%s'",
- fd, real_path);
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpre);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
goto out;
}
- /* TODO: check for all the type of flag, and behave appropriately */
+ ret = sys_fallocate(pfd->fd, flags, offset, len);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
- while ((dirent = readdir (dir))) {
- if (!dirent)
- break;
+ ret = posix_fdstat (this, pfd->fd, statpost);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
- /* This helps in self-heal, when only directories
- needs to be replicated */
+out:
+ SET_TO_OLD_FS_ID ();
- /* This is to reduce the network traffic, in case only
- directory is needed from posix */
+ return ret;
+}
- strncpy (tmp_real_path, real_path, ZR_PATH_MAX);
- strncat (tmp_real_path, "/",
- ZR_PATH_MAX - strlen (tmp_real_path));
+char*
+_page_aligned_alloc (size_t size, char **aligned_buf)
+{
+ char *alloc_buf = NULL;
+ char *buf = NULL;
- strncat (tmp_real_path, dirent->d_name,
- ZR_PATH_MAX - strlen (tmp_real_path));
- ret = lstat (tmp_real_path, &buf);
+ alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
+ if (!alloc_buf)
+ goto out;
+ /* page aligned buffer */
+ buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+ *aligned_buf = buf;
+out:
+ return alloc_buf;
+}
- if ((flag == GF_GET_DIR_ONLY)
- && (ret != -1 && !S_ISDIR(buf.st_mode))) {
- continue;
+/*
+ * Write @len zero bytes to @fd starting at @offset.
+ *
+ * The file offset of @fd is moved with lseek() and the zeros are
+ * written with writev().  One zero-filled buffer of at most VECTOR_SIZE
+ * bytes backs every iovec, and no more than MAX_NO_VECT iovecs are
+ * handed to a single writev() call.  When @o_direct is non-zero the
+ * buffer comes from _page_aligned_alloc().
+ *
+ * A writev() that returns fewer bytes than requested is continued until
+ * all @len bytes are on their way; a writev() that makes no progress at
+ * all is reported as EIO.
+ *
+ * Returns 0 on success and -1 on failure.  On failure errno holds the
+ * error of the call that failed; it is restored after the buffers are
+ * released so that the caller can still read it.
+ */
+static int32_t
+_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct)
+{
+        size_t        num_vect    = 0;
+        size_t        batch       = 0;
+        size_t        idx         = 0;
+        size_t        pending     = len;
+        ssize_t       written     = 0;
+        int32_t       op_ret      = -1;
+        int32_t       vect_size   = VECTOR_SIZE;
+        int           saved_errno = 0;
+        struct iovec *vector      = NULL;
+        char         *iov_base    = NULL;
+        char         *alloc_buf   = NULL;
+
+        if (len == 0)
+                return 0;
+        if (len < VECTOR_SIZE)
+                vect_size = len;
+
+        /* never build more than MAX_NO_VECT iovecs for one writev() */
+        num_vect = len / vect_size;
+        if (num_vect > MAX_NO_VECT)
+                num_vect = MAX_NO_VECT;
+
+        vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+                            gf_common_mt_iovec);
+        if (!vector)
+                return -1;
+
+        if (o_direct) {
+                alloc_buf = _page_aligned_alloc(vect_size, &iov_base);
+                if (!alloc_buf) {
+                        saved_errno = errno;
+                        gf_log ("_posix_do_zerofill", GF_LOG_DEBUG,
+                                "memory alloc failed, vect_size %d: %s",
+                                vect_size, strerror(saved_errno));
+                        goto out;
                 }
+        } else {
+                iov_base = GF_CALLOC (vect_size, sizeof(char),
+                                      gf_common_mt_char);
+                if (!iov_base) {
+                        saved_errno = errno;
+                        goto out;
+                }
+                /* the buffer itself is what has to be released */
+                alloc_buf = iov_base;
+        }
+
+        for (idx = 0; idx < num_vect; idx++) {
+                vector[idx].iov_base = iov_base;
+                vector[idx].iov_len  = vect_size;
+        }
+
+        if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
+                saved_errno = errno;
+                goto out;
+        }
+
+        /*
+         * Every iovec points at the same zero buffer, so after a short
+         * write it is enough to know how many bytes are still missing.
+         */
+        while (pending > 0) {
+                batch = pending / vect_size;
+                if (batch > num_vect)
+                        batch = num_vect;
+                if (batch == 0) {
+                        /* tail that is shorter than one full vector */
+                        vector[0].iov_len = pending;
+                        batch = 1;
+                }
+
+                written = writev(fd, vector, (int) batch);
+                if (written < 0) {
+                        saved_errno = errno;
+                        goto out;
+                }
+                if (written == 0) {
+                        /* no progress: fail instead of spinning */
+                        saved_errno = EIO;
+                        goto out;
+                }
+                pending -= (size_t) written;
+        }
+
+        op_ret = 0;
+out:
+        if (alloc_buf)
+                GF_FREE(alloc_buf);
+        GF_FREE(vector);
+        if (op_ret < 0)
+                errno = saved_errno;
+        return op_ret;
+}
- tmp = CALLOC (1, sizeof (*tmp));
+static int32_t
+posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, struct iatt *statpre,
+ struct iatt *statpost)
+{
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
- if (!tmp) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ DECLARE_OLD_FS_ID_VAR;
- tmp->name = strdup (dirent->d_name);
- if (!tmp->name) {
- op_errno = errno;
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpre);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation fstat failed on fd = %p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+ ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT);
+ if (ret < 0) {
+ ret = -errno;
+ gf_log(this->name, GF_LOG_ERROR,
+ "zerofill failed on fd %d length %ld %s",
+ pfd->fd, len, strerror(errno));
+ goto out;
+ }
+ if (pfd->flags & (O_SYNC|O_DSYNC)) {
+ ret = fsync (pfd->fd);
+ if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
+ "fsync() in writev on fd %d failed: %s",
+ pfd->fd, strerror (errno));
+ ret = -errno;
goto out;
}
+ }
- if (entry_path_len <
- (real_path_len + 1 + strlen (tmp->name) + 1)) {
- entry_path_len = (real_path_len +
- strlen (tmp->name) + 1024);
+ ret = posix_fdstat (this, pfd->fd, statpost);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post operation fstat failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
- entry_path = realloc (entry_path, entry_path_len);
- }
+out:
+ SET_TO_OLD_FS_ID ();
- strcpy (&entry_path[real_path_len+1], tmp->name);
+ return ret;
+}
- ret = lstat (entry_path, &tmp->buf);
+static int32_t
+_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret;
+ int32_t flags = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- entry_path, strerror (op_errno));
- goto out;
- }
+ if (keep_size)
+ flags = FALLOC_FL_KEEP_SIZE;
- if (S_ISLNK(tmp->buf.st_mode)) {
+ ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+ &statpre, &statpost);
+ if (ret < 0)
+ goto err;
- ret = readlink (entry_path, linkpath, ZR_PATH_MAX);
- if (ret != -1) {
- linkpath[ret] = '\0';
- tmp->link = strdup (linkpath);
- }
- } else {
- tmp->link = "";
- }
+ STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
- count++;
+err:
+ STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL);
+ return 0;
+}
- tmp->next = entries.next;
- entries.next = tmp;
+static int32_t
+posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int32_t ret;
+ int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
- /* if size is 0, count can never be = size, so entire
- dir is read */
- if (count == size)
- break;
- }
+ ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+ &statpre, &statpost);
+ if (ret < 0)
+ goto err;
- FREE (entry_path);
+ STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
- op_ret = 0;
+err:
+ STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL);
+ return 0;
- out:
- SET_TO_OLD_FS_ID ();
+}
- if (op_ret == -1) {
- if (entry_path)
- FREE (entry_path);
+/*
+ * zerofill fop: zero @len bytes of @fd starting at @offset and unwind
+ * the frame with the result.
+ *
+ * The work is done by posix_do_zerofill().  A negative result from it
+ * is unwound as op_ret -1 with the negated value as op_errno and no
+ * stat buffers; any other result is unwound as success together with
+ * the pre- and post-operation attributes.
+ *
+ * Always returns 0; the outcome is delivered through the unwind.
+ */
+static int32_t
+posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+        struct iatt pre  = {0,};
+        struct iatt post = {0,};
+        int32_t     rc   = 0;
+
+        rc = posix_do_zerofill(frame, this, fd, offset, len,
+                               &pre, &post);
+        if (rc < 0) {
+                STACK_UNWIND_STRICT(zerofill, frame, -1, -rc, NULL, NULL, NULL);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &pre, &post, NULL);
+        return 0;
+}
+
+/*
+ * opendir fop: open the directory named by @loc and attach the
+ * resulting stream to @fd.
+ *
+ * On success a struct posix_fd holding the DIR stream and its
+ * descriptor is stored in the context of @fd and the frame is unwound
+ * with op_ret 0.  On failure the stream and the posix_fd are released
+ * and the frame is unwound with op_ret -1 and the error in op_errno.
+ *
+ * Always returns 0; the outcome is delivered through the unwind.
+ */
+int32_t
+posix_opendir (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+        char            *real_path = NULL;
+        int32_t          op_ret    = -1;
+        int32_t          op_errno  = EINVAL;
+        DIR             *dir       = NULL;
+        struct posix_fd *pfd       = NULL;
+
+        DECLARE_OLD_FS_ID_VAR;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->path, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        SET_FS_ID (frame->root->uid, frame->root->gid);
+        MAKE_INODE_HANDLE (real_path, this, loc, NULL);
+
+        /* MAKE_INODE_HANDLE appears to assign op_ret (other callers
+           test it right after the macro); start again from "failed" */
+        op_ret = -1;
+        dir = opendir (real_path);
+
+        if (dir == NULL) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "opendir failed on %s: %s",
+                        real_path, strerror (op_errno));
+                goto out;
         }
- STACK_UNWIND (frame, op_ret, op_errno, &entries, count);
+        op_ret = dirfd (dir);
+        if (op_ret < 0) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "dirfd() failed on %s: %s",
+                        real_path, strerror (op_errno));
+                goto out;
+        }
- if (op_ret == 0) {
- while (entries.next) {
- tmp = entries.next;
- entries.next = entries.next->next;
- FREE (tmp->name);
- FREE (tmp);
+        pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd);
+        if (!pfd) {
+                /* op_ret still holds the descriptor from dirfd();
+                   without this reset the cleanup below is skipped,
+                   the stream leaks and the fop reports success */
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        pfd->dir = dir;
+        pfd->fd = dirfd (dir);
+
+        /* NOTE(review): a failure here is only logged and the fop still
+           succeeds; confirm that an fd without context is acceptable */
+        op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd);
+        if (op_ret)
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context path=%s fd=%p",
+                        real_path, fd);
+
+        op_ret = 0;
+
+out:
+        if (op_ret == -1) {
+                if (dir) {
+                        closedir (dir);
+                        dir = NULL;
+                }
+                if (pfd) {
+                        GF_FREE (pfd);
+                        pfd = NULL;
                 }
         }
+        SET_TO_OLD_FS_ID ();
+        STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL);
         return 0;
 }
-
int32_t
posix_releasedir (xlator_t *this,
- fd_t *fd)
+ fd_t *fd)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
struct posix_fd * pfd = NULL;
- uint64_t tmp_pfd = 0;
+ uint64_t tmp_pfd = 0;
int ret = 0;
+ struct posix_private *priv = NULL;
+
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
ret = fd_ctx_del (fd, this, &tmp_pfd);
if (ret < 0) {
- op_errno = -ret;
gf_log (this->name, GF_LOG_DEBUG,
"pfd from fd=%p is NULL", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
+ pfd = (struct posix_fd *)(long)tmp_pfd;
if (!pfd->dir) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd->dir is NULL for fd=%p path=%s",
- fd, pfd->path ? pfd->path : "<NULL>");
- goto out;
- }
-
- ret = closedir (pfd->dir);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "closedir on %p failed: %s", pfd->dir,
- strerror (errno));
- goto out;
- }
- pfd->dir = NULL;
-
- if (!pfd->path) {
- op_errno = EBADFD;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd->path was NULL. fd=%p pfd=%p",
- fd, pfd);
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd->dir is NULL for fd=%p", fd);
goto out;
}
- op_ret = 0;
+ priv = this->private;
- out:
- if (pfd) {
- if (pfd->path)
- FREE (pfd->path);
- FREE (pfd);
+ pthread_mutex_lock (&priv->janitor_lock);
+ {
+ INIT_LIST_HEAD (&pfd->list);
+ list_add_tail (&pfd->list, &priv->janitor_fds);
+ pthread_cond_signal (&priv->janitor_cond);
}
+ pthread_mutex_unlock (&priv->janitor_lock);
+out:
return 0;
}
int32_t
posix_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
+ loc_t *loc, size_t size, dict_t *xdata)
{
char * dest = NULL;
int32_t op_ret = -1;
int32_t op_errno = 0;
char * real_path = NULL;
+ struct iatt stbuf = {0,};
DECLARE_OLD_FS_ID_VAR;
- VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (frame, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
dest = alloca (size + 1);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, &stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "lstat on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
+ }
op_ret = readlink (real_path, dest, size);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "readlink on %s failed: %s", loc->path,
+ "readlink on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
dest[op_ret] = 0;
-
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, dest);
+ STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, dest, &stbuf, NULL);
return 0;
}
-int32_t
+
+int
posix_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev)
+ loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
- int tmp_fd = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = { 0, };
+ int tmp_fd = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = 0;
+ char *par_path = 0;
+ struct iatt stbuf = { 0, };
+ char was_present = 1;
+ struct posix_private *priv = NULL;
+ gid_t gid = 0;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ void * uuid_req = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -733,68 +1033,157 @@ posix_mknod (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL);
+ gid = frame->root->gid;
+
+ SET_FS_ID (frame->root->uid, gid);
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent of %s failed: %s",
+ real_path, strerror (op_errno));
+ goto out;
+ }
+
+ if (preparent.ia_prot.sgid) {
+ gid = preparent.ia_gid;
+ }
+
+ /* Check if the 'gfid' already exists, because this mknod may be an
+ internal call from distribute for creating 'linkfile', and that
+ linkfile may be for a hardlinked file */
+ if (dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ dict_del (xdata, GLUSTERFS_INTERNAL_FOP_KEY);
+ op_ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get the gfid from dict for %s",
+ loc->path);
+ goto real_op;
+ }
+ op_ret = posix_create_link_if_gfid_exists (this, uuid_req,
+ real_path);
+ if (!op_ret)
+ goto post_op;
+ }
+
+real_op:
+#ifdef __NetBSD__
+ if (S_ISFIFO(mode))
+ op_ret = mkfifo (real_path, mode);
+ else
+#endif /* __NetBSD__ */
op_ret = mknod (real_path, mode, dev);
if (op_ret == -1) {
op_errno = errno;
- if ((op_errno == EINVAL) && S_ISREG (mode)) {
- /* Over Darwin, mknod with (S_IFREG|mode)
- doesn't work */
- tmp_fd = creat (real_path, mode);
- if (tmp_fd == -1)
- goto out;
- close (tmp_fd);
- } else {
+ if ((op_errno == EINVAL) && S_ISREG (mode)) {
+ /* Over Darwin, mknod with (S_IFREG|mode)
+ doesn't work */
+ tmp_fd = creat (real_path, mode);
+ if (tmp_fd == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "create failed on %s: %s",
+ real_path, strerror (errno));
+ goto out;
+ }
+ close (tmp_fd);
+ } else {
- gf_log (this->name, GF_LOG_ERROR,
- "mknod on %s failed: %s", loc->path,
- strerror (op_errno));
- goto out;
- }
+ gf_log (this->name, GF_LOG_ERROR,
+ "mknod on %s failed: %s", real_path,
+ strerror (op_errno));
+ goto out;
+ }
+ }
+
+ op_ret = posix_gfid_set (this, real_path, loc, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting gfid on %s failed", real_path);
}
#ifndef HAVE_SET_FSID
- op_ret = lchown (real_path, frame->root->uid, frame->root->gid);
+ op_ret = lchown (real_path, frame->root->uid, gid);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lchown on %s failed: %s", loc->path,
+ "lchown on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
#endif
- op_ret = lstat (real_path, &stbuf);
+post_op:
+ op_ret = posix_acl_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting ACLs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting xattrs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+ op_ret = posix_pstat (this, NULL, real_path, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "mknod on %s failed: %s", loc->path,
+ "mknod on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+ STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno,
+ (loc)?loc->inode:NULL, &stbuf, &preparent,
+ &postparent, NULL);
+
+ if ((op_ret == -1) && (!was_present)) {
+ unlink (real_path);
+ }
return 0;
}
-int32_t
+
+int
posix_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode)
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = NULL;
- struct stat stbuf = {0, };
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ struct iatt stbuf = {0, };
+ char was_present = 1;
+ struct posix_private *priv = NULL;
+ gid_t gid = 0;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
DECLARE_OLD_FS_ID_VAR;
@@ -802,44 +1191,116 @@ posix_mkdir (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ /* The Hidden directory should be for housekeeping purpose and it
+ should not get created from a user request */
+ if (__is_root_gfid (loc->pargfid) &&
+ (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "mkdir issued on %s, which is not permitted",
+ GF_HIDDEN_PATH);
+ op_errno = EPERM;
+ op_ret = -1;
+ goto out;
+ }
+
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL);
+
+ gid = frame->root->gid;
+
+ op_ret = posix_pstat (this, NULL, real_path, &stbuf);
+ if ((op_ret == -1) && (errno == ENOENT)) {
+ was_present = 0;
+ }
+
+ SET_FS_ID (frame->root->uid, gid);
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
+ if (preparent.ia_prot.sgid) {
+ gid = preparent.ia_gid;
+ mode |= S_ISGID;
+ }
op_ret = mkdir (real_path, mode);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "mkdir of %s failed: %s", loc->path,
+ "mkdir of %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
+ op_ret = posix_gfid_set (this, real_path, loc, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting gfid on %s failed", real_path);
+ }
+
#ifndef HAVE_SET_FSID
- op_ret = chown (real_path, frame->root->uid, frame->root->gid);
+ op_ret = chown (real_path, frame->root->uid, gid);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "chown on %s failed: %s", loc->path,
+ "chown on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
#endif
- op_ret = lstat (real_path, &stbuf);
+ op_ret = posix_acl_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting ACLs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting xattrs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_pstat (this, NULL, real_path, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s", loc->path,
+ "lstat on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent of %s failed: %s",
+ real_path, strerror (op_errno));
+ goto out;
+ }
+
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+ STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno,
+ (loc)?loc->inode:NULL, &stbuf, &preparent,
+ &postparent, NULL);
+
+ if ((op_ret == -1) && (!was_present)) {
+ unlink (real_path);
+ }
return 0;
}
@@ -847,13 +1308,17 @@ posix_mkdir (call_frame_t *frame, xlator_t *this,
int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+ loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- int32_t fd = -1;
- struct posix_private *priv = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
DECLARE_OLD_FS_ID_VAR;
@@ -862,38 +1327,60 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf);
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
+ if (stbuf.ia_nlink == 1)
+ posix_handle_unset (this, stbuf.ia_gfid, NULL);
priv = this->private;
if (priv->background_unlink) {
- if (S_ISREG (loc->inode->st_mode)) {
+ if (IA_ISREG (loc->inode->ia_type)) {
fd = open (real_path, O_RDONLY);
if (fd == -1) {
op_ret = -1;
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed: %s", loc->path,
+ "open of %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
}
}
- op_ret = unlink (real_path);
+ op_ret = sys_unlink (real_path);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "unlink of %s failed: %s", loc->path,
+ "unlink of %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
+ &preparent, &postparent, NULL);
if (fd != -1) {
close (fd);
@@ -902,13 +1389,20 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
return 0;
}
-int32_t
+
+int
posix_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+ loc_t *loc, int flags, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = 0;
- char * real_path = 0;
+ char * real_path = NULL;
+ char * par_path = NULL;
+ char * gfid_str = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ struct iatt stbuf;
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -917,38 +1411,102 @@ posix_rmdir (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
- op_ret = rmdir (real_path);
+ /* The Hidden directory should be for housekeeping purpose and it
+ should not get deleted from inside process */
+ if (__is_root_gfid (loc->pargfid) &&
+ (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "rmdir issued on %s, which is not permitted",
+ GF_HIDDEN_PATH);
+ op_errno = EPERM;
+ op_ret = -1;
+ goto out;
+ }
+
+ priv = this->private;
+
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf);
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
+ if (flags) {
+ gfid_str = uuid_utoa (stbuf.ia_gfid);
+ char *tmp_path = alloca (strlen (priv->trash_path) +
+ strlen ("/") +
+ strlen (gfid_str) + 1);
+
+ mkdir (priv->trash_path, 0755);
+ sprintf (tmp_path, "%s/%s", priv->trash_path, gfid_str);
+ op_ret = rename (real_path, tmp_path);
+ } else {
+ op_ret = rmdir (real_path);
+ }
op_errno = errno;
- if (op_errno == EEXIST)
- /* Solaris sets errno = EEXIST instead of ENOTEMPTY */
- op_errno = ENOTEMPTY;
+ if (op_ret == 0) {
+ posix_handle_unset (this, stbuf.ia_gfid, NULL);
+ }
+
+ if (op_errno == EEXIST)
+ /* Solaris sets errno = EEXIST instead of ENOTEMPTY */
+ op_errno = ENOTEMPTY;
+ /* No need to log a common error as ENOTEMPTY */
if (op_ret == -1 && op_errno != ENOTEMPTY) {
gf_log (this->name, GF_LOG_ERROR,
- "rmdir of %s failed: %s", loc->path,
+ "rmdir of %s failed: %s", real_path,
strerror (op_errno));
+ }
+
+ if (op_ret == -1) {
+ gf_log (this->name,
+ (op_errno == ENOTEMPTY) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+ "%s on %s failed", (flags) ? "rename" : "rmdir",
+ real_path);
goto out;
}
- out:
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent of %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
+ &preparent, &postparent, NULL);
return 0;
}
-int32_t
+
+int
posix_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkname, loc_t *loc)
+ const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = { 0, };
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char * real_path = 0;
+ char * par_path = 0;
+ struct iatt stbuf = { 0, };
+ struct posix_private *priv = NULL;
+ gid_t gid = 0;
+ char was_present = 1;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
DECLARE_OLD_FS_ID_VAR;
@@ -957,8 +1515,31 @@ posix_symlink (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (linkname, out);
VALIDATE_OR_GOTO (loc, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf);
+
+ if ((op_ret == -1) && (errno == ENOENT)){
+ was_present = 0;
+ }
+
+ SET_FS_ID (frame->root->uid, gid);
+
+ gid = frame->root->gid;
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+
+ if (preparent.ia_prot.sgid) {
+ gid = preparent.ia_gid;
+ }
op_ret = symlink (linkname, real_path);
@@ -966,35 +1547,71 @@ posix_symlink (call_frame_t *frame, xlator_t *this,
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"symlink of %s --> %s failed: %s",
- loc->path, linkname, strerror (op_errno));
+ real_path, linkname, strerror (op_errno));
goto out;
}
+ op_ret = posix_gfid_set (this, real_path, loc, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting gfid on %s failed", real_path);
+ }
+
#ifndef HAVE_SET_FSID
- op_ret = lchown (real_path, frame->root->uid, frame->root->gid);
+ op_ret = lchown (real_path, frame->root->uid, gid);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"lchown failed on %s: %s",
- loc->path, strerror (op_errno));
+ real_path, strerror (op_errno));
goto out;
}
#endif
- op_ret = lstat (real_path, &stbuf);
+
+ op_ret = posix_acl_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting ACLs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting xattrs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_pstat (this, NULL, real_path, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"lstat failed on %s: %s",
- loc->path, strerror (op_errno));
+ real_path, strerror (op_errno));
+ goto out;
+ }
+
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
goto out;
}
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf);
+ STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno,
+ (loc)?loc->inode:NULL, &stbuf, &preparent,
+ &postparent, NULL);
+
+ if ((op_ret == -1) && (!was_present)) {
+ unlink (real_path);
+ }
return 0;
}
@@ -1002,13 +1619,26 @@ posix_symlink (call_frame_t *frame, xlator_t *this,
int
posix_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_oldpath = NULL;
- char * real_newpath = NULL;
- struct stat stbuf = {0, };
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_oldpath = NULL;
+ char *real_newpath = NULL;
+ char *par_oldpath = NULL;
+ char *par_newpath = NULL;
+ struct iatt stbuf = {0, };
+ struct posix_private *priv = NULL;
+ char was_present = 1;
+ struct iatt preoldparent = {0, };
+ struct iatt postoldparent = {0, };
+ struct iatt prenewparent = {0, };
+ struct iatt postnewparent = {0, };
+ char olddirid[64];
+ char newdirid[64];
+ uuid_t victim = {0};
+ int was_dir = 0;
+ int nlink = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1017,213 +1647,237 @@ posix_rename (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (oldloc, out);
VALIDATE_OR_GOTO (newloc, out);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
- MAKE_REAL_PATH (real_newpath, this, newloc->path);
+ MAKE_ENTRY_HANDLE (real_oldpath, par_oldpath, this, oldloc, NULL);
+ MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, newloc, &stbuf);
- op_ret = rename (real_oldpath, real_newpath);
+ op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &preoldparent);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name,
- (op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "rename of %s to %s failed: %s",
- oldloc->path, newloc->path, strerror (op_errno));
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on parent %s failed: %s",
+ par_oldpath, strerror (op_errno));
goto out;
}
- op_ret = lstat (real_newpath, &stbuf);
+ op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &prenewparent);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- real_newpath, strerror (op_errno));
+ "pre-operation lstat on parent of %s failed: %s",
+ par_newpath, strerror (op_errno));
goto out;
}
- op_ret = 0;
-
- out:
- SET_TO_OLD_FS_ID ();
+ op_ret = posix_pstat (this, NULL, real_newpath, &stbuf);
+ if ((op_ret == -1) && (errno == ENOENT)){
+ was_present = 0;
+ } else {
+ uuid_copy (victim, stbuf.ia_gfid);
+ if (IA_ISDIR (stbuf.ia_type))
+ was_dir = 1;
+ nlink = stbuf.ia_nlink;
+ }
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+ if (was_present && IA_ISDIR(stbuf.ia_type) && !newloc->inode) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "found directory at %s while expecting ENOENT",
+ real_newpath);
+ op_ret = -1;
+ op_errno = EEXIST;
+ goto out;
+ }
- return 0;
-}
+ if (was_present && IA_ISDIR(stbuf.ia_type) &&
+ uuid_compare (newloc->inode->gfid, stbuf.ia_gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "found directory %s at %s while renaming %s",
+ uuid_utoa_r (newloc->inode->gfid, olddirid),
+ real_newpath,
+ uuid_utoa_r (stbuf.ia_gfid, newdirid));
+ op_ret = -1;
+ op_errno = EEXIST;
+ goto out;
+ }
+ if (IA_ISDIR (oldloc->inode->ia_type)) {
+ posix_handle_unset (this, oldloc->inode->gfid, NULL);
+ }
-int
-posix_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_oldpath = 0;
- char * real_newpath = 0;
- struct stat stbuf = {0, };
+ op_ret = sys_rename (real_oldpath, real_newpath);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name,
+ (op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR),
+ "rename of %s to %s failed: %s",
+ real_oldpath, real_newpath, strerror (op_errno));
+ goto out;
+ }
+ if (was_dir)
+ posix_handle_unset (this, victim, NULL);
- DECLARE_OLD_FS_ID_VAR;
+ if (was_present && !was_dir && nlink == 1)
+ posix_handle_unset (this, victim, NULL);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (oldloc, out);
- VALIDATE_OR_GOTO (newloc, out);
+ if (IA_ISDIR (oldloc->inode->ia_type)) {
+ posix_handle_soft (this, real_newpath, newloc,
+ oldloc->inode->gfid, NULL);
+ }
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
- MAKE_REAL_PATH (real_newpath, this, newloc->path);
+ op_ret = posix_pstat (this, NULL, real_newpath, &stbuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "lstat on %s failed: %s",
+ real_newpath, strerror (op_errno));
+ goto out;
+ }
- op_ret = link (real_oldpath, real_newpath);
+ op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &postoldparent);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "link %s to %s failed: %s",
- oldloc->path, newloc->path, strerror (op_errno));
+ "post-operation lstat on parent %s failed: %s",
+ par_oldpath, strerror (op_errno));
goto out;
}
- op_ret = lstat (real_newpath, &stbuf);
+ op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postnewparent);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- real_newpath, strerror (op_errno));
+ "post-operation lstat on parent %s failed: %s",
+ par_newpath, strerror (op_errno));
goto out;
}
op_ret = 0;
- out:
+out:
+
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, oldloc->inode, &stbuf);
+ STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf,
+ &preoldparent, &postoldparent,
+ &prenewparent, &postnewparent, NULL);
+
+ if ((op_ret == -1) && !was_present) {
+ unlink (real_newpath);
+ }
return 0;
}
int
-posix_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode)
+posix_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_oldpath = 0;
+ char *real_newpath = 0;
+ char *par_newpath = 0;
+ struct iatt stbuf = {0, };
+ struct posix_private *priv = NULL;
+ char was_present = 1;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
DECLARE_OLD_FS_ID_VAR;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
+ VALIDATE_OR_GOTO (oldloc, out);
+ VALIDATE_OR_GOTO (newloc, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
- if (S_ISLNK (loc->inode->st_mode)) {
- /* chmod on a link should always succeed */
- op_ret = lstat (real_path, &stbuf);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- real_path, strerror (op_errno));
- goto out;
- }
- op_ret = 0;
- goto out;
- }
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+ MAKE_INODE_HANDLE (real_oldpath, this, oldloc, &stbuf);
- op_ret = lchmod (real_path, mode);
- if ((op_ret == -1) && (errno == ENOSYS)) {
- gf_log (this->name, GF_LOG_TRACE,
- "lchmod not implemented, falling back to chmod");
- op_ret = chmod (real_path, mode);
+ MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, newloc, &stbuf);
+ if ((op_ret == -1) && (errno == ENOENT)) {
+ was_present = 0;
}
+ op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &preparent);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "chmod on %s failed: %s",
- loc->path, strerror (op_errno));
+ gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s",
+ par_newpath, strerror (op_errno));
goto out;
}
- op_ret = lstat (real_path, &stbuf);
+#ifdef HAVE_LINKAT
+ /*
+ * On most systems (Linux being the notable exception), link(2)
+ * first resolves symlinks. If the target is a directory or
+ * is nonexistent, it will fail. linkat(2) operates on the
+ * symlink instead of its target when the AT_SYMLINK_FOLLOW
+ * flag is not supplied.
+ */
+ op_ret = linkat (AT_FDCWD, real_oldpath, AT_FDCWD, real_newpath, 0);
+#else
+ op_ret = link (real_oldpath, real_newpath);
+#endif
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s",
- real_path, strerror (op_errno));
+ gf_log (this->name, GF_LOG_ERROR,
+ "link %s to %s failed: %s",
+ real_oldpath, real_newpath, strerror (op_errno));
goto out;
}
- op_ret = 0;
-
- out:
- SET_TO_OLD_FS_ID ();
-
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}
-
-
-int
-posix_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = {0,};
-
- DECLARE_OLD_FS_ID_VAR;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
-
- op_ret = lchown (real_path, uid, gid);
+ op_ret = posix_pstat (this, NULL, real_newpath, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lchown on %s failed: %s",
- loc->path, strerror (op_errno));
+ "lstat on %s failed: %s",
+ real_newpath, strerror (op_errno));
goto out;
}
- op_ret = lstat (real_path, &stbuf);
+ op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postparent);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s",
- real_path, strerror (op_errno));
+ gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s",
+ par_newpath, strerror (op_errno));
goto out;
}
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+ STACK_UNWIND_STRICT (link, frame, op_ret, op_errno,
+ (oldloc)?oldloc->inode:NULL, &stbuf, &preparent,
+ &postparent, NULL);
+
+ if ((op_ret == -1) && (!was_present)) {
+ unlink (real_newpath);
+ }
return 0;
}
int32_t
-posix_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
+posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = 0;
+ struct posix_private *priv = NULL;
+ struct iatt prebuf = {0,};
+ struct iatt postbuf = {0,};
DECLARE_OLD_FS_ID_VAR;
@@ -1231,19 +1885,30 @@ posix_truncate (call_frame_t *frame,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+
+ MAKE_INODE_HANDLE (real_path, this, loc, &prebuf);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation lstat on %s failed: %s",
+ real_path, strerror (op_errno));
+ goto out;
+ }
op_ret = truncate (real_path, offset);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"truncate on %s failed: %s",
- loc->path, strerror (op_errno));
+ real_path, strerror (op_errno));
goto out;
}
- op_ret = lstat (real_path, &stbuf);
+ op_ret = posix_pstat (this, loc->gfid, real_path, &postbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s",
@@ -1253,97 +1918,65 @@ posix_truncate (call_frame_t *frame,
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
+ &prebuf, &postbuf, NULL);
return 0;
}
int
-posix_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec ts[2])
+posix_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- struct stat stbuf = {0,};
- struct timeval tv[2] = {{0,},{0,}};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t _fd = -1;
+ int _flags = 0;
+ char * real_path = NULL;
+ char * par_path = NULL;
+ struct iatt stbuf = {0, };
+ struct posix_fd * pfd = NULL;
+ struct posix_private * priv = NULL;
+ char was_present = 1;
+
+ gid_t gid = 0;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
DECLARE_OLD_FS_ID_VAR;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
VALIDATE_OR_GOTO (loc, out);
+ VALIDATE_OR_GOTO (fd, out);
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
- tv[0].tv_sec = ts[0].tv_sec;
- tv[0].tv_usec = ts[0].tv_nsec / 1000;
- tv[1].tv_sec = ts[1].tv_sec;
- tv[1].tv_usec = ts[1].tv_nsec / 1000;
+ MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf);
- op_ret = lutimes (real_path, tv);
- if ((op_ret == -1) && (errno == ENOSYS)) {
- op_ret = utimes (real_path, tv);
- }
+ gid = frame->root->gid;
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "utimes on %s failed: %s", real_path,
- strerror (op_errno));
- goto out;
- }
+ SET_FS_ID (frame->root->uid, gid);
- op_ret = lstat (real_path, &stbuf);
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lstat on %s failed: %s", real_path,
- strerror (op_errno));
+ "pre-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
goto out;
}
- op_ret = 0;
-
- out:
- SET_TO_OLD_FS_ID ();
-
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
-
- return 0;
-}
-
-int32_t
-posix_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- int _flags = 0;
- char * real_path = NULL;
- struct stat stbuf = {0, };
- struct posix_fd * pfd = NULL;
- struct posix_private * priv = NULL;
-
- DECLARE_OLD_FS_ID_VAR;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
-
- SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ if (preparent.ia_prot.sgid) {
+ gid = preparent.ia_gid;
+ }
if (!flags) {
_flags = O_CREAT | O_RDWR | O_EXCL;
@@ -1352,6 +1985,11 @@ posix_create (call_frame_t *frame, xlator_t *this,
_flags = flags | O_CREAT;
}
+ op_ret = posix_pstat (this, NULL, real_path, &stbuf);
+ if ((op_ret == -1) && (errno == ENOENT)) {
+ was_present = 0;
+ }
+
if (priv->o_direct)
_flags |= O_DIRECT;
@@ -1359,23 +1997,48 @@ posix_create (call_frame_t *frame, xlator_t *this,
if (_fd == -1) {
op_errno = errno;
+ op_ret = -1;
gf_log (this->name, GF_LOG_ERROR,
- "open on %s failed: %s", loc->path,
+ "open on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
+ if (was_present)
+ goto fill_stat;
+
+ op_ret = posix_gfid_set (this, real_path, loc, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting gfid on %s failed", real_path);
+ }
+
#ifndef HAVE_SET_FSID
- op_ret = chown (real_path, frame->root->uid, frame->root->gid);
+ op_ret = chown (real_path, frame->root->uid, gid);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"chown on %s failed: %s",
- real_path, strerror (op_errno));
+ real_path, strerror (op_errno));
}
#endif
- op_ret = fstat (_fd, &stbuf);
+ op_ret = posix_acl_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting ACLs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+ op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "setting xattrs on %s failed (%s)", real_path,
+ strerror (errno));
+ }
+
+fill_stat:
+ op_ret = posix_fdstat (this, _fd, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
@@ -1383,46 +2046,68 @@ posix_create (call_frame_t *frame, xlator_t *this,
goto out;
}
- op_ret = -1;
- pfd = CALLOC (1, sizeof (*pfd));
+ op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation lstat on parent %s failed: %s",
+ par_path, strerror (op_errno));
+ goto out;
+ }
+ op_ret = -1;
+ pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd);
if (!pfd) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
goto out;
}
pfd->flags = flags;
pfd->fd = _fd;
- fd_ctx_set (fd, this, (uint64_t)(long)pfd);
+ op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd);
+ if (op_ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set the fd context path=%s fd=%p",
+ real_path, fd);
- ((struct posix_private *)this->private)->stats.nr_files++;
+ LOCK (&priv->lock);
+ {
+ priv->nr_files++;
+ }
+ UNLOCK (&priv->lock);
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- if ((-1 == op_ret) && (_fd != -1))
+ if ((-1 == op_ret) && (_fd != -1)) {
close (_fd);
- STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf);
+ if (!was_present) {
+ unlink (real_path);
+ }
+ }
+
+ STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
+ fd, (loc)?loc->inode:NULL, &stbuf, &preparent,
+ &postparent, xdata);
return 0;
}
int32_t
posix_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, fd_t *fd)
+ loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = NULL;
- int32_t _fd = -1;
- struct posix_fd * pfd = NULL;
- struct posix_private * priv = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ int32_t _fd = -1;
+ struct posix_fd *pfd = NULL;
+ struct posix_private *priv = NULL;
+ struct iatt stbuf = {0, };
DECLARE_OLD_FS_ID_VAR;
@@ -1433,75 +2118,66 @@ posix_open (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (fd, out);
priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+ MAKE_INODE_HANDLE (real_path, this, loc, &stbuf);
+
+ op_ret = -1;
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
if (priv->o_direct)
flags |= O_DIRECT;
_fd = open (real_path, flags, 0);
if (_fd == -1) {
+ op_ret = -1;
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
"open on %s: %s", real_path, strerror (op_errno));
goto out;
}
- pfd = CALLOC (1, sizeof (*pfd));
-
+ pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd);
if (!pfd) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
goto out;
}
pfd->flags = flags;
pfd->fd = _fd;
- fd_ctx_set (fd, this, (uint64_t)(long)pfd);
-
- ((struct posix_private *)this->private)->stats.nr_files++;
+ op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd);
+ if (op_ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set the fd context path=%s fd=%p",
+ real_path, fd);
-#ifndef HAVE_SET_FSID
- if (flags & O_CREAT) {
- op_ret = chown (real_path, frame->root->uid, frame->root->gid);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "chown on %s failed: %s",
- real_path, strerror (op_errno));
- goto out;
- }
+ LOCK (&priv->lock);
+ {
+ priv->nr_files++;
}
-#endif
+ UNLOCK (&priv->lock);
op_ret = 0;
- out:
+out:
if (op_ret == -1) {
if (_fd != -1) {
close (_fd);
- _fd = -1;
}
}
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, fd);
+ STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
return 0;
}
-#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \
- (unsigned long)(~(bound - 1))))
-
int
posix_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+ fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
- uint64_t tmp_pfd = 0;
int32_t op_ret = -1;
int32_t op_errno = 0;
int _fd = -1;
@@ -1510,8 +2186,7 @@ posix_readv (call_frame_t *frame, xlator_t *this,
struct iobref * iobref = NULL;
struct iovec vec = {0,};
struct posix_fd * pfd = NULL;
- struct stat stbuf = {0,};
- int align = 1;
+ struct iatt stbuf = {0,};
int ret = -1;
VALIDATE_OR_GOTO (frame, out);
@@ -1520,45 +2195,30 @@ posix_readv (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this->private, out);
priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL from fd=%p", fd);
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL from fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
if (!size) {
op_errno = EINVAL;
- gf_log (this->name, GF_LOG_DEBUG, "size=%"GF_PRI_SIZET, size);
+ gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
goto out;
}
- if (pfd->flags & O_DIRECT) {
- align = 4096; /* align to page boundary */
- }
-
- iobuf = iobuf_get (this->ctx->iobuf_pool);
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
if (!iobuf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
+ op_errno = ENOMEM;
goto out;
}
_fd = pfd->fd;
-
- op_ret = lseek (_fd, offset, SEEK_SET);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lseek(%"PRId64") failed: %s",
- offset, strerror (op_errno));
- goto out;
- }
-
- op_ret = read (_fd, iobuf->ptr, size);
+ op_ret = pread (_fd, iobuf->ptr, size, offset);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
@@ -1567,13 +2227,15 @@ posix_readv (call_frame_t *frame, xlator_t *this,
goto out;
}
- priv->read_value += op_ret;
- priv->interval_read += op_ret;
+ LOCK (&priv->lock);
+ {
+ priv->read_value += op_ret;
+ }
+ UNLOCK (&priv->lock);
vec.iov_base = iobuf->ptr;
vec.iov_len = op_ret;
- op_ret = -1;
iobref = iobref_new ();
iobref_add (iobref, iobuf);
@@ -1583,7 +2245,7 @@ posix_readv (call_frame_t *frame, xlator_t *this,
* we read from
*/
- op_ret = fstat (_fd, &stbuf);
+ op_ret = posix_fdstat (this, _fd, &stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
@@ -1591,11 +2253,16 @@ posix_readv (call_frame_t *frame, xlator_t *this,
strerror (op_errno));
goto out;
}
-
- op_ret = vec.iov_len;
- out:
- STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf, iobref);
+ /* Hack to notify higher layers of EOF. */
+ if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size)
+ op_errno = ENOENT;
+
+ op_ret = vec.iov_len;
+out:
+
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
+ &vec, 1, &stbuf, iobref, NULL);
if (iobref)
iobref_unref (iobref);
@@ -1607,25 +2274,139 @@ posix_readv (call_frame_t *frame, xlator_t *this,
int32_t
-posix_writev (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
+__posix_pwritev (int fd, struct iovec *vector, int count, off_t offset)
+{
+ int32_t op_ret = 0;
+ int idx = 0;
+ int retval = 0;
+ off_t internal_off = 0;
+
+ if (!vector)
+ return -EFAULT;
+
+ internal_off = offset;
+ for (idx = 0; idx < count; idx++) {
+ retval = pwrite (fd, vector[idx].iov_base, vector[idx].iov_len,
+ internal_off);
+ if (retval == -1) {
+ op_ret = -errno;
+ goto err;
+ }
+ op_ret += retval;
+ internal_off += retval;
+ }
+
+err:
+ return op_ret;
+}
+
+int32_t
+__posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
+ int odirect)
+{
+ int32_t op_ret = 0;
+ int idx = 0;
+ int max_buf_size = 0;
+ int retval = 0;
+ char *buf = NULL;
+ char *alloc_buf = NULL;
+ off_t internal_off = 0;
+
+ /* Check for the O_DIRECT flag during open() */
+ if (!odirect)
+ return __posix_pwritev (fd, vector, count, startoff);
+
+ for (idx = 0; idx < count; idx++) {
+ if (max_buf_size < vector[idx].iov_len)
+ max_buf_size = vector[idx].iov_len;
+ }
+
+ alloc_buf = _page_aligned_alloc (max_buf_size, &buf);
+ if (!alloc_buf) {
+ op_ret = -errno;
+ goto err;
+ }
+
+ internal_off = startoff;
+ for (idx = 0; idx < count; idx++) {
+ memcpy (buf, vector[idx].iov_base, vector[idx].iov_len);
+
+ /* not sure whether writev works on O_DIRECT'd fd */
+ retval = pwrite (fd, buf, vector[idx].iov_len, internal_off);
+ if (retval == -1) {
+ op_ret = -errno;
+ goto err;
+ }
+
+ op_ret += retval;
+ internal_off += retval;
+ }
+
+err:
+ GF_FREE (alloc_buf);
+
+ return op_ret;
+}
+
+dict_t*
+_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
+{
+ dict_t *rsp_xdata = NULL;
+ int32_t ret = 0;
+ inode_t *inode = NULL;
+
+ if (fd)
+ inode = fd->inode;
+
+ if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: "
+ "fd: %p inode: %p gfid:%s", fd, inode?inode:0,
+ inode?uuid_utoa(inode->gfid):"N/A");
+ goto out;
+ }
+
+ if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT))
+ goto out;
+
+ rsp_xdata = dict_new();
+ if (!rsp_xdata)
+ goto out;
+
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
+ fd->inode->fd_count);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+ "dictionary value for %s", uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_OPEN_FD_COUNT);
+ }
+
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
+ is_append);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+ "dictionary value for %s", uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_WRITE_IS_APPEND);
+ }
+out:
+ return rsp_xdata;
+}
+
+int32_t
+posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata) /* writev fop: write the vector at offset, then unwind with pre/post attributes */
{
int32_t op_ret = -1;
int32_t op_errno = 0;
int _fd = -1;
struct posix_private * priv = NULL;
struct posix_fd * pfd = NULL;
- struct stat stbuf = {0,};
+ struct iatt preop = {0,}; /* attributes sampled before the write */
+ struct iatt postop = {0,}; /* attributes sampled after the write */
int ret = -1;
-
- int idx = 0;
- int align = 4096;
- int max_buf_size = 0;
- int retval = 0;
- char * buf = NULL;
- char * alloc_buf = NULL;
- uint64_t tmp_pfd = 0;
+ dict_t *rsp_xdata = NULL; /* reply dictionary, released after the unwind */
+ int is_append = 0; /* only evaluated when the request carries GLUSTERFS_WRITE_IS_APPEND */
+ gf_boolean_t locked = _gf_false; /* true while fd->inode->lock is held */
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1637,115 +2418,111 @@ posix_writev (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (priv, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL from fd=%p", fd);
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL from fd=%p", fd);
op_errno = -ret;
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
- op_ret = lseek (_fd, offset, SEEK_SET);
+ if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ /* The write_is_append check and write must happen
+ atomically. Else another write can overtake this
+ write after the check and get written earlier.
+ So lock before preop-stat and unlock after write.
+ */
+ locked = _gf_true;
+ LOCK(&fd->inode->lock);
+ }
+
+ op_ret = posix_fdstat (this, _fd, &preop);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "lseek(%"PRId64") on fd=%p failed: %s",
- offset, fd, strerror (op_errno));
+ "pre-operation fstat failed on fd=%p: %s", fd,
+ strerror (op_errno));
goto out;
}
- /* Check for the O_DIRECT flag during open() */
- if (pfd->flags & O_DIRECT) {
- /* This is O_DIRECT'd file */
- op_ret = -1;
- for (idx = 0; idx < count; idx++) {
- if (max_buf_size < vector[idx].iov_len)
- max_buf_size = vector[idx].iov_len;
- }
-
- alloc_buf = MALLOC (1 * (max_buf_size + align));
- if (!alloc_buf) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- for (idx = 0; idx < count; idx++) {
- /* page aligned buffer */
- buf = ALIGN_BUF (alloc_buf, align);
-
- memcpy (buf, vector[idx].iov_base,
- vector[idx].iov_len);
-
- /* not sure whether writev works on O_DIRECT'd fd */
- retval = write (_fd, buf, vector[idx].iov_len);
-
- if (retval == -1) {
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "O_DIRECT enabled on fd=%p: %s",
- fd, strerror (op_errno));
- goto out;
- }
+ if (locked) {
+ if (preop.ia_size == offset || (fd->flags & O_APPEND))
+ is_append = 1;
+ }
- break;
- }
- if (op_ret == -1)
- op_ret = 0;
- op_ret += retval;
- }
+ op_ret = __posix_writev (_fd, vector, count, offset,
+ (pfd->flags & O_DIRECT)); /* negative return is a negated errno (decoded below) */
- } else /* if (O_DIRECT) */ {
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
- /* This is not O_DIRECT'd fd */
- op_ret = writev (_fd, vector, count);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "writev failed on fd=%p: %s",
- fd, strerror (op_errno));
- goto out;
- }
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
+ ", %s", offset, strerror (op_errno));
+ goto out;
}
- priv->write_value += op_ret;
- priv->interval_write += op_ret;
+ LOCK (&priv->lock);
+ {
+ priv->write_value += op_ret;
+ }
+ UNLOCK (&priv->lock);
if (op_ret >= 0) {
+ rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append);
/* wiretv successful, we also need to get the stat of
* the file we wrote to
*/
- ret = fstat (_fd, &stbuf);
+
+ if (flags & (O_SYNC|O_DSYNC)) {
+ ret = fsync (_fd);
+ if (ret) {
+ op_ret = -1;
+ op_errno = errno; /* capture before gf_log() can disturb errno */
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync() in writev on fd %d failed: %s",
+ _fd, strerror (op_errno));
+ goto out;
+ }
+ }
+
+ ret = posix_fdstat (this, _fd, &postop);
if (ret == -1) {
- op_ret = -1;
+ op_ret = -1;
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "fstat failed on fd=%p: %s",
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation fstat failed on fd=%p: %s",
fd, strerror (op_errno));
goto out;
}
}
- out:
- if (alloc_buf) {
- FREE (alloc_buf);
- }
+out:
- STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
+ if (locked) { /* error paths taken while the inode lock was still held */
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop,
+ rsp_xdata);
+ if (rsp_xdata)
+ dict_unref (rsp_xdata);
return 0;
}
int32_t
posix_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+ loc_t *loc, dict_t *xdata)
{
char * real_path = NULL;
int32_t op_ret = -1;
@@ -1758,7 +2535,7 @@ posix_statfs (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc, out);
VALIDATE_OR_GOTO (this->private, out);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
priv = this->private;
@@ -1766,7 +2543,7 @@ posix_statfs (call_frame_t *frame, xlator_t *this,
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"statvfs failed on %s: %s",
real_path, strerror (op_errno));
goto out;
@@ -1783,116 +2560,127 @@ posix_statfs (call_frame_t *frame, xlator_t *this,
op_ret = 0;
- out:
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
+out:
+ STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL);
return 0;
}
int32_t
posix_flush (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+ fd_t *fd, dict_t *xdata) /* flush fop: performs no I/O; succeeds whenever the fd has a posix context */
{
int32_t op_ret = -1;
int32_t op_errno = 0;
- int _fd = -1;
- struct posix_fd * pfd = NULL;
int ret = -1;
- uint64_t tmp_pfd = 0;
+ struct posix_fd *pfd = NULL; /* fetched only to confirm the context exists; not used afterwards */
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd); /* a negative return is treated as a negated errno below */
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL on fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
-
- _fd = pfd->fd;
-
- /* do nothing */
op_ret = 0;
- out:
- STACK_UNWIND (frame, op_ret, op_errno);
+out:
+ STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); /* no response xdata is returned */
return 0;
}
int32_t
-posix_release (xlator_t *this,
- fd_t *fd)
+posix_release (xlator_t *this, fd_t *fd) /* detach the fd's posix context and queue it on priv->janitor_fds; always returns 0 */
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
struct posix_private * priv = NULL;
struct posix_fd * pfd = NULL;
int ret = -1;
- uint64_t tmp_pfd = 0;
+ uint64_t tmp_pfd = 0; /* receives the context value, which is the posix_fd pointer (cast below) */
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
priv = this->private;
- priv->stats.nr_files--;
-
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = fd_ctx_del (fd, this, &tmp_pfd); /* presumably removes the context from the fd as well as returning it - confirm against fd_ctx_del */
if (ret < 0) {
- op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL from fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
-
- _fd = pfd->fd;
-
- op_ret = close (_fd);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "close failed on fd=%p: %s", fd, strerror (op_errno));
- goto out;
- }
+ pfd = (struct posix_fd *)(long)tmp_pfd;
if (pfd->dir) {
- op_ret = -1;
- op_errno = EBADF;
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd->dir is %p (not NULL) for file fd=%p",
pfd->dir, fd);
- goto out;
}
- op_ret = 0;
+ pthread_mutex_lock (&priv->janitor_lock);
+ {
+ INIT_LIST_HEAD (&pfd->list);
+ list_add_tail (&pfd->list, &priv->janitor_fds); /* the descriptor is not closed here; the consumer of this list is outside this function */
+ pthread_cond_signal (&priv->janitor_cond);
+ }
+ pthread_mutex_unlock (&priv->janitor_lock);
- out:
- if (pfd)
- FREE (pfd);
+ LOCK (&priv->lock);
+ {
+ priv->nr_files--; /* only reached when a context was found; the early-exit paths skip it */
+ }
+ UNLOCK (&priv->lock);
+out:
return 0;
}
+/* Queue an fsync as a call stub on priv->fsyncs instead of syncing inline. */
+int
+posix_batch_fsync (call_frame_t *frame, xlator_t *this,
+                   fd_t *fd, int datasync, dict_t *xdata)
+{
+        struct posix_private *conf   = this->private;
+        call_stub_t          *queued = NULL;
+
+        queued = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata);
+        if (queued == NULL) {
+                /* No stub could be built: fail the fop right here. */
+                STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0);
+                return 0;
+        }
+
+        /* Append under fsync_mutex and signal fsync_cond; the waiter that
+         * drains the list is not part of this function. */
+        pthread_mutex_lock (&conf->fsync_mutex);
+        list_add_tail (&queued->list, &conf->fsyncs);
+        conf->fsync_queue_count++;
+        pthread_cond_signal (&conf->fsync_cond);
+        pthread_mutex_unlock (&conf->fsync_mutex);
+
+        return 0;
+}
+
+
int32_t
posix_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t datasync)
+ fd_t *fd, int32_t datasync, dict_t *xdata) /* fsync fop: sync the fd (or hand it to the batch queue) and unwind with pre/post attributes */
{
int32_t op_ret = -1;
int32_t op_errno = 0;
int _fd = -1;
struct posix_fd * pfd = NULL;
int ret = -1;
- uint64_t tmp_pfd = 0;
+ struct iatt preop = {0,}; /* attributes sampled before the sync */
+ struct iatt postop = {0,}; /* attributes sampled after the sync */
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -1908,183 +2696,96 @@ posix_fsync (call_frame_t *frame, xlator_t *this,
goto out;
#endif
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ priv = this->private;
+ if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { /* batching requested: posix_batch_fsync takes over the frame */
+ posix_batch_fsync (frame, this, fd, datasync, xdata);
+ return 0;
+ }
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd not found in fd's ctx");
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
+ op_ret = posix_fdstat (this, _fd, &preop);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_WARNING,
+ "pre-operation fstat failed on fd=%p: %s", fd,
+ strerror (op_errno));
+ goto out;
+ }
+
if (datasync) {
;
#ifdef HAVE_FDATASYNC
op_ret = fdatasync (_fd);
+ if (op_ret == -1) { /* a failed fdatasync must fail the fop, exactly like the fsync branch */
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fdatasync on fd=%p failed: %s",
+ fd, strerror (op_errno));
+ goto out;
+ }
#endif
} else {
op_ret = fsync (_fd);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"fsync on fd=%p failed: %s",
fd, strerror (op_errno));
+ goto out;
}
}
+ op_ret = posix_fdstat (this, _fd, &postop);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_WARNING,
+ "post-operation fstat failed on fd=%p: %s", fd,
+ strerror (op_errno));
+ goto out;
+ }
+
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, &postop,
+ NULL);
return 0;
}
static int gf_posix_xattr_enotsup_log;
-
-int
-set_file_contents (xlator_t *this, char *real_path,
- data_pair_t *trav, int flags)
-{
- char * key = NULL;
- char real_filepath[ZR_PATH_MAX] = {0,};
- int32_t file_fd = -1;
- int op_ret = 0;
- int ret = -1;
-
- key = &(trav->key[15]);
- sprintf (real_filepath, "%s/%s", real_path, key);
-
- if (flags & XATTR_REPLACE) {
- /* if file exists, replace it
- * else, error out */
- file_fd = open (real_filepath, O_TRUNC|O_WRONLY);
-
- if (file_fd == -1) {
- goto create;
- }
-
- if (trav->value->len) {
- ret = write (file_fd, trav->value->data,
- trav->value->len);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "write failed while doing setxattr "
- "for key %s on path %s: %s",
- key, real_filepath, strerror (errno));
- goto out;
- }
-
- ret = close (file_fd);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "close failed on %s: %s",
- real_filepath, strerror (errno));
- goto out;
- }
- }
-
- create: /* we know file doesn't exist, create it */
-
- file_fd = open (real_filepath, O_CREAT|O_WRONLY, 0644);
-
- if (file_fd == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "failed to open file %s with O_CREAT: %s",
- key, strerror (errno));
- goto out;
- }
-
- ret = write (file_fd, trav->value->data, trav->value->len);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "write failed on %s while setxattr with "
- "key %s: %s",
- real_filepath, key, strerror (errno));
- goto out;
- }
-
- ret = close (file_fd);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "close failed on %s while setxattr with "
- "key %s: %s",
- real_filepath, key, strerror (errno));
- goto out;
- }
- }
-
- out:
- return op_ret;
-}
-
-int
-handle_pair (xlator_t *this, char *real_path,
- data_pair_t *trav, int flags)
+static int
+_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+ void *tmp) /* per-pair callback handed to dict_foreach by posix_setxattr; tmp is its posix_xattr_filler_t */
{
- int sys_ret = -1;
- int ret = 0;
-
- if (ZR_FILE_CONTENT_REQUEST(trav->key)) {
- ret = set_file_contents (this, real_path, trav, flags);
- } else {
- sys_ret = sys_lsetxattr (real_path, trav->key,
- trav->value->data,
- trav->value->len, flags);
-
- if (sys_ret < 0) {
- if (errno == ENOTSUP) {
- GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
- this->name,GF_LOG_WARNING,
- "Extended attributes not "
- "supported");
- } else if (errno == ENOENT) {
- gf_log (this->name, GF_LOG_ERROR,
- "setxattr on %s failed: %s", real_path,
- strerror (errno));
- } else {
+ posix_xattr_filler_t *filler = NULL;
-#ifdef GF_DARWIN_HOST_OS
- gf_log (this->name,
- ((errno == EINVAL) ?
- GF_LOG_DEBUG : GF_LOG_ERROR),
- "%s: key:%s error:%s",
- real_path, trav->key,
- strerror (errno));
-#else /* ! DARWIN */
- gf_log (this->name, GF_LOG_ERROR,
- "%s: key:%s error:%s",
- real_path, trav->key,
- strerror (errno));
-#endif /* DARWIN */
- }
+ filler = tmp; /* carries this, real_path and flags from the caller */
- ret = -errno;
- goto out;
- }
- }
- out:
- return ret;
+ return posix_handle_pair (filler->this, filler->real_path, k, v,
+ filler->flags); /* the helper's result is returned unchanged */
}
int32_t
posix_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int flags)
+ loc_t *loc, dict_t *dict, int flags, dict_t *xdata) /* setxattr fop: apply every pair in dict to the path resolved from loc */
{
int32_t op_ret = -1;
int32_t op_errno = 0;
char * real_path = NULL;
- data_pair_t * trav = NULL;
- int ret = -1;
+
+ posix_xattr_filler_t filler = {0,}; /* context passed to the dict_foreach callback */
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2094,96 +2793,72 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc, out);
VALIDATE_OR_GOTO (dict, out);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
- trav = dict->members_list;
+ op_ret = -1;
+ dict_del (dict, GFID_XATTR_KEY); /* GFID_XATTR_KEY is dropped from the request before any pair is applied */
- while (trav) {
- ret = handle_pair (this, real_path, trav, flags);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- trav = trav->next;
- }
-
- op_ret = 0;
+ filler.real_path = real_path;
+ filler.this = this;
+ filler.flags = flags;
+ op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair,
+ &filler);
+ if (op_ret < 0)
+ op_errno = -op_ret; /* a negative result is treated as a negated errno */
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL);
return 0;
}
+
int
-get_file_contents (xlator_t *this, char *real_path,
- const char *name, char **contents)
+posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *dict, dict_t *xdata) /* case-insensitive lookup of a name inside the directory at loc; returns stored length or a negative errno */
{
- char real_filepath[ZR_PATH_MAX] = {0,};
- char * key = NULL;
- int32_t file_fd = -1;
- struct stat stbuf = {0,};
- int op_ret = 0;
- int ret = -1;
-
- key = (char *) &(name[15]);
- sprintf (real_filepath, "%s/%s", real_path, key);
-
- op_ret = lstat (real_filepath, &stbuf);
- if (op_ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s",
- real_filepath, strerror (errno));
- goto out;
- }
-
- file_fd = open (real_filepath, O_RDONLY);
-
- if (file_fd == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s",
- real_filepath, strerror (errno));
- goto out;
- }
-
- *contents = CALLOC (stbuf.st_size + 1, sizeof(char));
-
- if (! *contents) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
- goto out;
- }
-
- ret = read (file_fd, *contents, stbuf.st_size);
- if (ret <= 0) {
- op_ret = -1;
- gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s",
- real_filepath, strerror (errno));
- goto out;
- }
+ char *real_path = NULL;
+ struct dirent *dirent = NULL;
+ DIR *fd = NULL; /* directory stream, despite the name */
+ const char *fname = NULL;
+ char *found = NULL;
+ int ret = -1;
+ int op_ret = -1; /* not read in this function's own statements; NOTE(review): may be referenced by MAKE_INODE_HANDLE - confirm */
+
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
+
+ fd = opendir (real_path);
+ if (!fd)
+ return -errno;
+
+ fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); /* the wanted name is whatever follows the key prefix */
+
+ while ((dirent = readdir (fd))) {
+ if (strcasecmp (dirent->d_name, fname) == 0) { /* first case-insensitive match wins */
+ found = gf_strdup (dirent->d_name);
+ if (!found) {
+ closedir (fd);
+ return -ENOMEM;
+ }
+ break;
+ }
+ }
- *contents[stbuf.st_size] = '\0';
+ closedir (fd);
- op_ret = close (file_fd);
- file_fd = -1;
- if (op_ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s",
- real_filepath, strerror (errno));
- goto out;
- }
+ if (!found)
+ return -ENOENT;
- out:
- if (op_ret < 0) {
- if (*contents)
- FREE (*contents);
- if (file_fd != -1)
- close (file_fd);
- }
+ ret = dict_set_dynstr (dict, (char *)key, found); /* the on-disk spelling is stored under the full request key */
+ if (ret) {
+ GF_FREE (found);
+ return -ENOMEM;
+ }
+ ret = strlen (found) + 1; /* success value: length of the stored name including its NUL */
- return op_ret;
+ return ret;
}
/**
@@ -2193,20 +2868,25 @@ get_file_contents (xlator_t *this, char *real_path,
*/
int32_t
posix_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
-{
- int32_t op_ret = -1;
- int32_t op_errno = ENOENT;
- int32_t list_offset = 0;
- size_t size = 0;
- size_t remaining_size = 0;
- char key[1024] = {0,};
- char * value = NULL;
- char * list = NULL;
- char * real_path = NULL;
- dict_t * dict = NULL;
- char * file_contents = NULL;
- int ret = -1;
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ struct posix_private *priv = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t list_offset = 0;
+ ssize_t size = 0;
+ size_t remaining_size = 0;
+ char key[4096] = {0,};
+ char host_buf[1024] = {0,};
+ char *value = NULL;
+ char *list = NULL;
+ char *real_path = NULL;
+ dict_t *dict = NULL;
+ char *file_contents = NULL;
+ int ret = -1;
+ char *path = NULL;
+ char *rpath = NULL;
+ char *dyn_rpath = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -2215,28 +2895,194 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
+
+ op_ret = -1;
+ priv = this->private;
- if (loc->inode && S_ISDIR(loc->inode->st_mode) && name &&
- ZR_FILE_CONTENT_REQUEST(name)) {
- ret = get_file_contents (this, real_path, name,
- &file_contents);
+ if (loc->inode && IA_ISDIR(loc->inode->ia_type) && name &&
+ ZR_FILE_CONTENT_REQUEST(name)) {
+ ret = posix_get_file_contents (this, loc->gfid, &name[15],
+ &file_contents);
if (ret < 0) {
op_errno = -ret;
gf_log (this->name, GF_LOG_ERROR,
- "getting file contents failed: %s",
+ "getting file contents failed: %s",
strerror (op_errno));
goto out;
}
}
- /* Get the total size */
- dict = get_new_dict ();
+ dict = dict_new ();
if (!dict) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
+ op_errno = ENOMEM;
goto out;
}
+ if (loc->inode && name &&
+ (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) {
+ ret = posix_xattr_get_real_filename (frame, this, loc,
+ name, dict, xdata);
+ if (ret < 0) {
+ op_ret = -1;
+ op_errno = -ret;
+ gf_log (this->name, (op_errno == ENOENT) ?
+ GF_LOG_DEBUG : GF_LOG_WARNING,
+ "Failed to get real filename (%s, %s): %s",
+ loc->path, name, strerror (op_errno));
+ goto out;
+ }
+
+ size = ret;
+ goto done;
+ }
+
+ if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) {
+ if (!list_empty (&loc->inode->fd_list)) {
+ ret = dict_set_uint32 (dict, (char *)name, 1);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to set dictionary value for %s",
+ name);
+ } else {
+ ret = dict_set_uint32 (dict, (char *)name, 0);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to set dictionary value for %s",
+ name);
+ }
+ goto done;
+ }
+ if (loc->inode && name &&
+ (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) {
+ if (LOC_HAS_ABSPATH (loc))
+ MAKE_REAL_PATH (rpath, this, loc->path);
+ else
+ rpath = real_path;
+
+ (void) snprintf (host_buf, 1024,
+ "<POSIX(%s):%s:%s>", priv->base_path,
+ ((priv->node_uuid_pathinfo
+ && !uuid_is_null(priv->glusterd_uuid))
+ ? uuid_utoa (priv->glusterd_uuid)
+ : priv->hostname),
+ rpath);
+
+ dyn_rpath = gf_strdup (host_buf);
+ if (!dyn_rpath) {
+ ret = -1;
+ goto done;
+ }
+ size = strlen (dyn_rpath) + 1;
+ ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY,
+ dyn_rpath);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not set value (%s) in dictionary",
+ dyn_rpath);
+ GF_FREE (dyn_rpath);
+ }
+
+ goto done;
+ }
+
+ if (loc->inode && name &&
+ (strcmp (name, GF_XATTR_NODE_UUID_KEY) == 0)
+ && !uuid_is_null (priv->glusterd_uuid)) {
+ (void) snprintf (host_buf, 1024, "%s",
+ uuid_utoa (priv->glusterd_uuid));
+
+ dyn_rpath = gf_strdup (host_buf);
+ if (!dyn_rpath) {
+ ret = -1;
+ goto done;
+ }
+
+ size = strlen (dyn_rpath) + 1;
+ ret = dict_set_dynstr (dict, GF_XATTR_NODE_UUID_KEY,
+ dyn_rpath);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not set value (%s) in dictionary",
+ dyn_rpath);
+ GF_FREE (dyn_rpath);
+ }
+ goto done;
+ }
+
+ if (loc->inode && name &&
+ (strcmp (name, GFID_TO_PATH_KEY) == 0)) {
+ ret = inode_path (loc->inode, NULL, &path);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: could not get "
+ "inode path", uuid_utoa (loc->inode->gfid));
+ goto done;
+ }
+
+ ret = dict_set_dynstr (dict, GFID_TO_PATH_KEY, path);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not set value (%s) in dictionary",
+ host_buf);
+ GF_FREE (path);
+ }
+ goto done;
+ }
+
+ if (name) {
+ strcpy (key, name);
+
+ size = sys_lgetxattr (real_path, key, NULL, 0);
+ if (size <= 0) {
+ op_errno = errno;
+ if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) {
+ GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
+ this->name, GF_LOG_WARNING,
+ "Extended attributes not "
+ "supported (try remounting"
+ " brick with 'user_xattr' "
+ "flag)");
+ } else if (op_errno == ENOATTR ||
+ op_errno == ENODATA) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No such attribute:%s for file %s",
+ key, real_path);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "getxattr failed on %s: %s (%s)",
+ real_path, key, strerror (op_errno));
+ }
+
+ goto done;
+ }
+ value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char);
+ if (!value) {
+ op_ret = -1;
+ goto out;
+ }
+ size = sys_lgetxattr (real_path, key, value, size);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "getxattr failed on "
+ "%s: key = %s (%s)", real_path, key,
+ strerror (op_errno));
+ GF_FREE (value);
+ goto out;
+ }
+ value [size] = '\0';
+ op_ret = dict_set_dynptr (dict, key, value, size);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "dict set operation "
+ "on %s for the key %s failed.", real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
+
+ goto done;
+ }
+
size = sys_llistxattr (real_path, NULL, 0);
if (size == -1) {
op_errno = errno;
@@ -2244,11 +3090,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
this->name, GF_LOG_WARNING,
"Extended attributes not "
- "supported.");
+ "supported (try remounting"
+ " brick with 'user_xattr' "
+ "flag)");
}
else {
gf_log (this->name, GF_LOG_ERROR,
- "listxattr failed on %s: %s",
+ "listxattr failed on %s: %s",
real_path, strerror (op_errno));
}
goto out;
@@ -2260,7 +3108,6 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
list = alloca (size + 1);
if (!list) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
goto out;
}
@@ -2269,43 +3116,63 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
remaining_size = size;
list_offset = 0;
while (remaining_size > 0) {
- if(*(list + list_offset) == '\0')
+ if (*(list + list_offset) == '\0')
break;
strcpy (key, list + list_offset);
- op_ret = sys_lgetxattr (real_path, key, NULL, 0);
- if (op_ret == -1)
+ size = sys_lgetxattr (real_path, key, NULL, 0);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "getxattr failed on "
+ "%s: key = %s (%s)", real_path, key,
+ strerror (op_errno));
break;
+ }
- value = CALLOC (op_ret + 1, sizeof(char));
+ value = GF_CALLOC (size + 1, sizeof(char),
+ gf_posix_mt_char);
if (!value) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
goto out;
}
- op_ret = sys_lgetxattr (real_path, key, value, op_ret);
- if (op_ret == -1)
+ size = sys_lgetxattr (real_path, key, value, size);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "getxattr failed on "
+ "%s: key = %s (%s)", real_path, key,
+ strerror (op_errno));
+ GF_FREE (value);
break;
+ }
+
+ value [size] = '\0';
+ op_ret = dict_set_dynptr (dict, key, value, size);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "dict set operation "
+ "on %s for the key %s failed.", real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
- value [op_ret] = '\0';
- dict_set (dict, key, data_from_dynptr (value, op_ret));
remaining_size -= strlen (key) + 1;
list_offset += strlen (key) + 1;
} /* while (remaining_size > 0) */
- done:
+done:
op_ret = size;
if (dict) {
- dict_ref (dict);
+ dict_del (dict, GFID_XATTR_KEY);
}
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, dict);
+ STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
@@ -2316,17 +3183,16 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
int32_t
posix_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name)
+ fd_t *fd, const char *name, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = ENOENT;
- uint64_t tmp_pfd = 0;
struct posix_fd * pfd = NULL;
int _fd = -1;
int32_t list_offset = 0;
- size_t size = 0;
+ ssize_t size = 0;
size_t remaining_size = 0;
- char key[1024] = {0,};
+ char key[4096] = {0,};
char * value = NULL;
char * list = NULL;
dict_t * dict = NULL;
@@ -2340,24 +3206,68 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this,
SET_FS_ID (frame->root->uid, frame->root->gid);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL from fd=%p", fd);
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL from fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
/* Get the total size */
dict = get_new_dict ();
if (!dict) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
goto out;
}
+ if (name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) {
+ ret = dict_set_uint32 (dict, (char *)name, 1);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to set dictionary value for %s",
+ name);
+ goto done;
+ }
+
+ if (name) {
+ strcpy (key, name);
+
+ size = sys_fgetxattr (_fd, key, NULL, 0);
+ if (size <= 0) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on "
+ "key %s (%s)", key, strerror (op_errno));
+ goto done;
+ }
+
+ value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char);
+ if (!value) {
+ op_ret = -1;
+ goto out;
+ }
+ size = sys_fgetxattr (_fd, key, value, size);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on "
+ "fd %p for the key %s (%s)", fd, key,
+ strerror (op_errno));
+ GF_FREE (value);
+ goto out;
+ }
+ value [size] = '\0';
+ op_ret = dict_set_dynptr (dict, key, value, size);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "dict set operation "
+ "on key %s failed", key);
+ GF_FREE (value);
+ goto out;
+ }
+ goto done;
+ }
+
size = sys_flistxattr (_fd, NULL, 0);
if (size == -1) {
op_errno = errno;
@@ -2365,11 +3275,12 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this,
GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
this->name, GF_LOG_WARNING,
"Extended attributes not "
- "supported.");
+ "supported (try remounting "
+ "brick with 'user_xattr' flag)");
}
else {
gf_log (this->name, GF_LOG_ERROR,
- "listxattr failed on %p: %s",
+ "listxattr failed on %p: %s",
fd, strerror (op_errno));
}
goto out;
@@ -2381,7 +3292,6 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this,
list = alloca (size + 1);
if (!list) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
goto out;
}
@@ -2394,39 +3304,60 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this,
break;
strcpy (key, list + list_offset);
- op_ret = sys_fgetxattr (_fd, key, NULL, 0);
- if (op_ret == -1)
+ size = sys_fgetxattr (_fd, key, NULL, 0);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on "
+ "fd %p for the key %s (%s)", fd, key,
+ strerror (op_errno));
break;
+ }
- value = CALLOC (op_ret + 1, sizeof(char));
+ value = GF_CALLOC (size + 1, sizeof(char),
+ gf_posix_mt_char);
if (!value) {
+ op_ret = -1;
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
goto out;
}
- op_ret = sys_fgetxattr (_fd, key, value, op_ret);
- if (op_ret == -1)
+ size = sys_fgetxattr (_fd, key, value, size);
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on "
+ "the fd %p for the key %s (%s)", fd, key,
+ strerror (op_errno));
+ GF_FREE (value);
break;
+ }
- value [op_ret] = '\0';
- dict_set (dict, key, data_from_dynptr (value, op_ret));
+ value [size] = '\0';
+ op_ret = dict_set_dynptr (dict, key, value, size);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_ERROR, "dict set operation "
+ "failed on key %s", key);
+ GF_FREE (value);
+ goto out;
+ }
remaining_size -= strlen (key) + 1;
list_offset += strlen (key) + 1;
} /* while (remaining_size > 0) */
- done:
+done:
op_ret = size;
if (dict) {
+ dict_del (dict, GFID_XATTR_KEY);
dict_ref (dict);
}
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, dict);
+ STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
@@ -2434,64 +3365,29 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+static int
+_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+ void *tmp) /* per-pair callback handed to dict_foreach by posix_fsetxattr; tmp is its posix_xattr_filler_t */
+{
+ posix_xattr_filler_t *filler = NULL;
-int
-fhandle_pair (xlator_t *this, int fd,
- data_pair_t *trav, int flags)
-{
- int sys_ret = -1;
- int ret = 0;
-
- sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data,
- trav->value->len, flags);
-
- if (sys_ret < 0) {
- if (errno == ENOTSUP) {
- GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
- this->name,GF_LOG_WARNING,
- "Extended attributes not "
- "supported");
- } else if (errno == ENOENT) {
- gf_log (this->name, GF_LOG_ERROR,
- "fsetxattr on fd=%d failed: %s", fd,
- strerror (errno));
- } else {
-
-#ifdef GF_DARWIN_HOST_OS
- gf_log (this->name,
- ((errno == EINVAL) ?
- GF_LOG_DEBUG : GF_LOG_ERROR),
- "fd=%d: key:%s error:%s",
- fd, trav->key,
- strerror (errno));
-#else /* ! DARWIN */
- gf_log (this->name, GF_LOG_ERROR,
- "fd=%d: key:%s error:%s",
- fd, trav->key,
- strerror (errno));
-#endif /* DARWIN */
- }
-
- ret = -errno;
- goto out;
- }
+ filler = tmp; /* carries this, fd and flags from the caller */
-out:
- return ret;
+ return posix_fhandle_pair (filler->this, filler->fd, k, v,
+ filler->flags); /* the helper's result is returned unchanged */
}
-
int32_t
posix_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int flags)
+ fd_t *fd, dict_t *dict, int flags, dict_t *xdata) /* fsetxattr fop: apply every pair in dict to the descriptor behind fd */
{
int32_t op_ret = -1;
int32_t op_errno = 0;
struct posix_fd * pfd = NULL;
- uint64_t tmp_pfd = 0;
int _fd = -1;
- data_pair_t * trav = NULL;
- int ret = -1;
+ int ret = -1;
+
+ posix_xattr_filler_t filler = {0,}; /* context passed to the dict_foreach callback */
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2501,103 +3397,190 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (fd, out);
VALIDATE_OR_GOTO (dict, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd); /* a negative return is treated as a negated errno below */
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL from fd=%p", fd);
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL from fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
- trav = dict->members_list;
-
- while (trav) {
- ret = fhandle_pair (this, _fd, trav, flags);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- trav = trav->next;
- }
+ dict_del (dict, GFID_XATTR_KEY); /* GFID_XATTR_KEY is dropped from the request before any pair is applied */
- op_ret = 0;
+ filler.fd = _fd;
+ filler.this = this;
+ filler.flags = flags;
+ op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair,
+ &filler);
+ if (op_ret < 0)
+ op_errno = -op_ret; /* a negative result is treated as a negated errno */
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL);
return 0;
}
+int
+_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data)
+{
+ int32_t op_ret = 0;
+ xlator_t *this = NULL;
+ posix_xattr_filler_t *filler = NULL;
+
+ filler = (posix_xattr_filler_t *) data;
+ this = filler->this;
+
+ op_ret = sys_lremovexattr (filler->real_path, key);
+ if (op_ret == -1) {
+ filler->op_errno = errno;
+ if (errno != ENOATTR && errno != EPERM)
+ gf_log (this->name, GF_LOG_ERROR,
+ "removexattr failed on %s (for %s): %s",
+ filler->real_path, key, strerror (errno));
+ }
+
+ return op_ret;
+}
+
int32_t
posix_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+ loc_t *loc, const char *name, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = 0;
char * real_path = NULL;
+ posix_xattr_filler_t filler = {0,};
DECLARE_OLD_FS_ID_VAR;
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
+
+ if (!strcmp (GFID_XATTR_KEY, name)) {
+ gf_log (this->name, GF_LOG_WARNING, "Remove xattr called"
+ " on gfid for file %s", real_path);
+ op_ret = -1;
+ goto out;
+ }
+
SET_FS_ID (frame->root->uid, frame->root->gid);
+ /**
+ * sending an empty key name with xdata containing the
+ * list of key(s) to be removed implies "bulk remove request"
+ * for removexattr.
+ */
+ if (name && (strcmp (name, "") == 0) && xdata) {
+ filler.real_path = real_path;
+ filler.this = this;
+ op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler);
+ if (op_ret) {
+ op_errno = filler.op_errno;
+ }
+
+ goto out;
+ }
+
op_ret = sys_lremovexattr (real_path, name);
+ if (op_ret == -1) {
+ op_errno = errno;
+ if (op_errno != ENOATTR && op_errno != EPERM)
+ gf_log (this->name, GF_LOG_ERROR,
+ "removexattr on %s (for %s): %s", real_path,
+ name, strerror (op_errno));
+ goto out;
+ }
+ op_ret = 0;
+
+out:
+ SET_TO_OLD_FS_ID ();
+
+ STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
+int32_t
+posix_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ struct posix_fd * pfd = NULL;
+ int _fd = -1;
+ int ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ if (!strcmp (GFID_XATTR_KEY, name)) {
+ gf_log (this->name, GF_LOG_WARNING, "Remove xattr called"
+ " on gfid for file");
+ goto out;
+ }
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ op_errno = -ret;
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+ _fd = pfd->fd;
+
+
+
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ op_ret = sys_fremovexattr (_fd, name);
if (op_ret == -1) {
op_errno = errno;
- if (op_errno != ENOATTR && op_errno != EPERM)
- gf_log (this->name, GF_LOG_ERROR,
- "removexattr on %s: %s", loc->path,
- strerror (op_errno));
+ if (op_errno != ENOATTR && op_errno != EPERM)
+ gf_log (this->name, GF_LOG_ERROR,
+ "fremovexattr (for %s): %s",
+ name, strerror (op_errno));
goto out;
}
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, NULL);
return 0;
}
int32_t
posix_fsyncdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int datasync)
+ fd_t *fd, int datasync, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = 0;
- struct posix_fd * pfd = NULL;
- int _fd = -1;
int ret = -1;
- uint64_t tmp_pfd = 0;
+ struct posix_fd *pfd = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL, fd=%p", fd);
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
-
- _fd = pfd->fd;
op_ret = 0;
- out:
- STACK_UNWIND (frame, op_ret, op_errno);
+out:
+ STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, NULL);
return 0;
}
@@ -2605,12 +3588,12 @@ posix_fsyncdir (call_frame_t *frame, xlator_t *this,
void
posix_print_xattr (dict_t *this,
- char *key,
- data_t *value,
- void *data)
+ char *key,
+ data_t *value,
+ void *data)
{
- gf_log ("posix", GF_LOG_DEBUG,
- "(key/val) = (%s/%d)", key, data_to_int32 (value));
+ gf_log ("posix", GF_LOG_DEBUG,
+ "(key/val) = (%s/%d)", key, data_to_int32 (value));
}
@@ -2624,245 +3607,281 @@ posix_print_xattr (dict_t *this,
static void
__add_array (int32_t *dest, int32_t *src, int count)
{
- int i = 0;
- for (i = 0; i < count; i++) {
- dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i]));
- }
+ int i = 0;
+ int32_t destval = 0;
+ for (i = 0; i < count; i++) {
+ destval = ntoh32 (dest[i]);
+ if (destval == 0xffffffff)
+ continue;
+ dest[i] = hton32 (destval + ntoh32 (src[i]));
+ }
}
+static void
+__or_array (int32_t *dest, int32_t *src, int count)
+{
+ int i = 0;
+ for (i = 0; i < count; i++) {
+ dest[i] = hton32 (ntoh32 (dest[i]) | ntoh32 (src[i]));
+ }
+}
-/**
- * xattrop - xattr operations - for internal use by GlusterFS
- * @optype: ADD_ARRAY:
- * dict should contain:
- * "key" ==> array of 32-bit numbers
- */
-
-int
-posix_xattrop (call_frame_t *frame, xlator_t *this,
- loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr)
+static void
+__and_array (int32_t *dest, int32_t *src, int count)
{
- char *real_path = NULL;
- int32_t *array = NULL;
- int size = 0;
- int count = 0;
+ int i = 0;
+ for (i = 0; i < count; i++) {
+ dest[i] = hton32 (ntoh32 (dest[i]) & ntoh32 (src[i]));
+ }
+}
- int op_ret = 0;
- int op_errno = 0;
+static void
+__add_long_array (int64_t *dest, int64_t *src, int count)
+{
+ int i = 0;
+ for (i = 0; i < count; i++) {
+ dest[i] = hton64 (ntoh64 (dest[i]) + ntoh64 (src[i]));
+ }
+}
- data_pair_t *trav = NULL;
+static int
+_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ int size = 0;
+ int count = 0;
+ int op_ret = 0;
+ int op_errno = 0;
+ gf_xattrop_flags_t optype = 0;
+ char *array = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+ posix_xattr_filler_t *filler = NULL;
+
+ filler = tmp;
+
+ optype = (gf_xattrop_flags_t)(filler->flags);
+ this = filler->this;
+ inode = filler->inode;
+
+ count = v->len;
+ array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char);
+
+ LOCK (&inode->lock);
+ {
+ if (filler->real_path) {
+ size = sys_lgetxattr (filler->real_path, k,
+ (char *)array, v->len);
+ } else {
+ size = sys_fgetxattr (filler->fd, k, (char *)array,
+ v->len);
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (xattr, out);
- VALIDATE_OR_GOTO (this, out);
+ op_errno = errno;
+ if ((size == -1) && (op_errno != ENODATA) &&
+ (op_errno != ENOATTR)) {
+ if (op_errno == ENOTSUP) {
+ GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
+ this->name, GF_LOG_WARNING,
+ "Extended attributes not "
+ "supported by filesystem");
+ } else if (op_errno != ENOENT ||
+ !posix_special_xattr (marker_xattrs,
+ k)) {
+ if (filler->real_path)
+ gf_log (this->name, GF_LOG_ERROR,
+ "getxattr failed on %s while doing "
+ "xattrop: Key:%s (%s)",
+ filler->real_path,
+ k, strerror (op_errno));
+ else
+ gf_log (this->name, GF_LOG_ERROR,
+ "fgetxattr failed on fd=%d while doing "
+ "xattrop: Key:%s (%s)",
+ filler->fd,
+ k, strerror (op_errno));
+ }
- trav = xattr->members_list;
+ op_ret = -1;
+ goto unlock;
+ }
- if (loc->path)
- MAKE_REAL_PATH (real_path, this, loc->path);
+ switch (optype) {
- while (trav) {
- count = trav->value->len / sizeof (int32_t);
- array = CALLOC (count, sizeof (int32_t));
-
- size = sys_lgetxattr (real_path, trav->key, (char *)array,
- trav->value->len);
+ case GF_XATTROP_ADD_ARRAY:
+ __add_array ((int32_t *) array, (int32_t *) v->data,
+ v->len / 4);
+ break;
- op_errno = errno;
- if ((size == -1) && (op_errno != ENODATA) &&
- (op_errno != ENOATTR)) {
- if (op_errno == ENOTSUP) {
- GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
- this->name,GF_LOG_WARNING,
- "Extended attributes not "
- "supported by filesystem");
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "getxattr failed on %s while doing "
- "xattrop: %s", loc->path,
- strerror (op_errno));
- }
- goto out;
- }
+ case GF_XATTROP_ADD_ARRAY64:
+ __add_long_array ((int64_t *) array, (int64_t *) v->data,
+ v->len / 8);
+ break;
- switch (optype) {
+ case GF_XATTROP_OR_ARRAY:
+ __or_array ((int32_t *) array,
+ (int32_t *) v->data,
+ v->len / 4);
+ break;
- case GF_XATTROP_ADD_ARRAY:
- __add_array (array, (int32_t *) trav->value->data,
- trav->value->len / 4);
- break;
+ case GF_XATTROP_AND_ARRAY:
+ __and_array ((int32_t *) array,
+ (int32_t *) v->data,
+ v->len / 4);
+ break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "Unknown xattrop type (%d) on %s. Please send "
+ default:
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unknown xattrop type (%d) on %s. Please send "
"a bug report to gluster-devel@nongnu.org",
- optype, loc->path);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ optype, filler->real_path);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto unlock;
+ }
- size = sys_lsetxattr (real_path, trav->key, array,
- trav->value->len, 0);
+ if (filler->real_path) {
+ size = sys_lsetxattr (filler->real_path, k, array,
+ v->len, 0);
+ } else {
+ size = sys_fsetxattr (filler->fd, k, (char *)array,
+ v->len, 0);
+ }
+ }
+unlock:
+ UNLOCK (&inode->lock);
+
+ if (op_ret == -1)
+ goto out;
+
+ op_errno = errno;
+ if (size == -1) {
+ if (filler->real_path)
+ gf_log (this->name, GF_LOG_ERROR,
+ "setxattr failed on %s while doing xattrop: "
+ "key=%s (%s)", filler->real_path,
+ k, strerror (op_errno));
+ else
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsetxattr failed on fd=%d while doing xattrop: "
+ "key=%s (%s)", filler->fd,
+ k, strerror (op_errno));
+
+ op_ret = -1;
+ goto out;
+ } else {
+ size = dict_set_bin (d, k, array, v->len);
+
+ if (size != 0) {
+ if (filler->real_path)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dict_set_bin failed (path=%s): "
+ "key=%s (%s)", filler->real_path,
+ k, strerror (-size));
+ else
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dict_set_bin failed (fd=%d): "
+ "key=%s (%s)", filler->fd,
+ k, strerror (-size));
+
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ array = NULL;
+ }
+
+ array = NULL;
- op_errno = errno;
- if (size == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "setxattr failed on %s while doing xattrop: "
- "key=%s (%s)", loc->path,
- trav->key, strerror (op_errno));
- op_ret = -1;
- goto out;
- } else {
- size = dict_set_bin (xattr, trav->key, array,
- trav->value->len);
-
- if (size != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_bin failed (path=%s): "
- "key=%s (%s)", loc->path,
- trav->key, strerror (-size));
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- array = NULL;
- }
-
- array = NULL;
- trav = trav->next;
- }
-
out:
- if (array)
- FREE (array);
- STACK_UNWIND (frame, op_ret, op_errno, xattr);
- return 0;
+ return op_ret;
}
+/**
+ * xattrop - xattr operations - for internal use by GlusterFS
+ * @optype: ADD_ARRAY, OR_ARRAY, AND_ARRAY or ADD_ARRAY64:
+ * dict should contain:
+ * "key" ==> array of 32-bit numbers (64-bit for ADD_ARRAY64)
+ */
int
-posix_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr)
+do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr)
{
- int32_t *array = NULL;
- int size = 0;
- int count = 0;
+ int op_ret = 0;
+ int op_errno = 0;
+ int _fd = -1;
+ char *real_path = NULL;
+ struct posix_fd *pfd = NULL;
+ inode_t *inode = NULL;
+ posix_xattr_filler_t filler = {0,};
- int op_ret = 0;
- int op_errno = 0;
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (xattr, out);
+ VALIDATE_OR_GOTO (this, out);
- int _fd = -1;
- struct posix_fd *pfd = NULL;
+ if (fd) {
+ op_ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to get pfd from fd=%p",
+ fd);
+ op_errno = EBADFD;
+ goto out;
+ }
+ _fd = pfd->fd;
+ }
- data_pair_t *trav = NULL;
- int32_t ret = -1;
+ if (loc && !uuid_is_null (loc->gfid))
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (xattr, out);
- VALIDATE_OR_GOTO (this, out);
+ if (real_path) {
+ inode = loc->inode;
+ } else if (fd) {
+ inode = fd->inode;
+ }
- trav = xattr->members_list;
+ filler.this = this;
+ filler.fd = _fd;
+ filler.real_path = real_path;
+ filler.flags = (int)optype;
+ filler.inode = inode;
- if (fd) {
- ret = fd_ctx_get (fd, this, (uint64_t *)&pfd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get pfd from fd=%p",
- fd);
- op_ret = -1;
- op_errno = EBADFD;
- goto out;
- }
- _fd = pfd->fd;
- }
+ op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair,
+ &filler);
- while (trav) {
- count = trav->value->len / sizeof (int32_t);
- array = CALLOC (count, sizeof (int32_t));
-
- size = sys_fgetxattr (_fd, trav->key, (char *)array,
- trav->value->len);
-
- op_errno = errno;
- if ((size == -1) && ((op_errno != ENODATA) &&
- (op_errno != ENOATTR))) {
- if (op_errno == ENOTSUP) {
- GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log,
- this->name,GF_LOG_WARNING,
- "extended attributes not "
- "supported by filesystem");
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "fgetxattr failed on fd=%d while: "
- "doing xattrop: %s", _fd,
- strerror (op_errno));
- }
- goto out;
- }
+out:
- switch (optype) {
- case GF_XATTROP_ADD_ARRAY:
- __add_array (array, (int32_t *) trav->value->data,
- trav->value->len / 4);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "Unknown xattrop type (%d) on fd=%d."
- "Please send a bug report to "
- "gluster-devel@nongnu.org",
- optype, _fd);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL);
+ return 0;
+}
- size = sys_fsetxattr (_fd, trav->key, (char *)array,
- trav->value->len, 0);
- op_errno = errno;
- if (size == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "fsetxattr failed on fd=%d while doing: "
- "xattrop. key=%s (%s)", _fd,
- trav->key, strerror (op_errno));
- op_ret = -1;
- goto out;
- } else {
- size = dict_set_bin (xattr, trav->key, array,
- trav->value->len);
-
- if (size != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_bin failed (fd=%d): "
- "key=%s (%s)", _fd,
- trav->key, strerror (-size));
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- array = NULL;
- }
-
- array = NULL;
- trav = trav->next;
- }
-
-out:
- if (array)
- FREE (array);
- STACK_UNWIND (frame, op_ret, op_errno, xattr);
- return 0;
+int
+posix_xattrop (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ do_xattrop (frame, this, loc, NULL, optype, xattr);
+ return 0;
+}
+
+
+int
+posix_fxattrop (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ do_xattrop (frame, this, NULL, fd, optype, xattr);
+ return 0;
}
int
posix_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
+ loc_t *loc, int32_t mask, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2871,37 +3890,37 @@ posix_access (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- MAKE_REAL_PATH (real_path, this, loc->path);
+ MAKE_INODE_HANDLE (real_path, this, loc, NULL);
op_ret = access (real_path, mask & 07);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s",
- loc->path, strerror (op_errno));
+ real_path, strerror (op_errno));
goto out;
}
-
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno);
+ STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL);
return 0;
}
int32_t
posix_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+ fd_t *fd, off_t offset, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- struct stat buf = {0,};
- struct posix_fd * pfd = NULL;
- int ret = -1;
- uint64_t tmp_pfd = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int _fd = -1;
+ struct iatt preop = {0,};
+ struct iatt postop = {0,};
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -2910,85 +3929,92 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL, fd=%p", fd);
op_errno = -ret;
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
+ op_ret = posix_fdstat (this, _fd, &preop);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation fstat failed on fd=%p: %s", fd,
+ strerror (op_errno));
+ goto out;
+ }
+
op_ret = ftruncate (_fd, offset);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "ftruncate failed on fd=%p: %s",
- fd, strerror (errno));
+ gf_log (this->name, GF_LOG_ERROR,
+ "ftruncate failed on fd=%p (%"PRId64": %s",
+ fd, offset, strerror (errno));
goto out;
}
- op_ret = fstat (_fd, &buf);
+ op_ret = posix_fdstat (this, _fd, &postop);
if (op_ret == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s",
+ gf_log (this->name, GF_LOG_ERROR,
+ "post-operation fstat failed on fd=%p: %s",
fd, strerror (errno));
goto out;
}
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop,
+ &postop, NULL);
return 0;
}
+
int32_t
-posix_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid)
+posix_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- struct stat buf = {0,};
- struct posix_fd * pfd = NULL;
- int ret = -1;
- uint64_t tmp_pfd = 0;
+ int _fd = -1;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ struct iatt buf = {0,};
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;
-
SET_FS_ID (frame->root->uid, frame->root->gid);
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ priv = this->private;
+ VALIDATE_OR_GOTO (priv, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL, fd=%p", fd);
op_errno = -ret;
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
_fd = pfd->fd;
- op_ret = fchown (_fd, uid, gid);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "fchown failed on fd=%p: %s",
- fd, strerror (op_errno));
- goto out;
- }
-
- op_ret = fstat (_fd, &buf);
+ op_ret = posix_fdstat (this, _fd, &buf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s",
@@ -2998,792 +4024,621 @@ posix_fchown (call_frame_t *frame, xlator_t *this,
op_ret = 0;
- out:
+out:
SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
-
+ STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, &buf, NULL);
return 0;
}
+static int gf_posix_lk_log;
int32_t
-posix_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode)
+posix_lk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- struct stat buf = {0,};
- struct posix_fd * pfd = NULL;
- int ret = -1;
- uint64_t tmp_pfd = 0;
-
- DECLARE_OLD_FS_ID_VAR;
-
- SET_FS_ID (frame->root->uid, frame->root->gid);
+ struct gf_flock nullock = {0, };
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_pfd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- pfd = (struct posix_fd *)(long)tmp_pfd;
+ GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL,
+ "\"features/locks\" translator is "
+ "not loaded. You need to use it for proper "
+ "functioning of your application.");
- _fd = pfd->fd;
+ STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL);
+ return 0;
+}
- op_ret = fchmod (_fd, mode);
+int32_t
+posix_inodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL,
+ "\"features/locks\" translator is "
+ "not loaded. You need to use it for proper "
+ "functioning of your application.");
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "fchmod failed on fd=%p: %s", fd, strerror (errno));
- goto out;
- }
+ STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL);
+ return 0;
+}
- op_ret = fstat (_fd, &buf);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "fstat failed on fd=%p: %s",
- fd, strerror (errno));
- goto out;
- }
+int32_t
+posix_finodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL,
+ "\"features/locks\" translator is "
+ "not loaded. You need to use it for proper "
+ "functioning of your application.");
- op_ret = 0;
+ STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL);
+ return 0;
+}
- out:
- SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
+int32_t
+posix_entrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL,
+ "\"features/locks\" translator is "
+ "not loaded. You need to use it for proper "
+ "functioning of your application.");
+ STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL);
return 0;
}
-
-static int
-same_file_type (mode_t m1, mode_t m2)
+int32_t
+posix_fentrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
- return ((S_IFMT & (m1 ^ m2)) == 0);
+ GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL,
+ "\"features/locks\" translator is "
+ "not loaded. You need to use it for proper "
+ "functioning of your application.");
+
+ STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL);
+ return 0;
}
-static int
-ensure_file_type (xlator_t *this, char *pathname, mode_t mode)
+int
+posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size,
+ gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs)
{
- struct stat stbuf = {0,};
- int op_ret = 0;
- int ret = -1;
-
- ret = lstat (pathname, &stbuf);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "stat failed while trying to make sure entry %s "
- "is a directory: %s", pathname, strerror (errno));
- goto out;
+ off_t in_case = -1;
+ size_t filled = 0;
+ int count = 0;
+ char entrybuf[sizeof(struct dirent) + 256 + 8];
+ struct dirent *entry = NULL;
+ int32_t this_size = -1;
+ gf_dirent_t *this_entry = NULL;
+ uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
+ struct stat stbuf = {0,};
+ char *hpath = NULL;
+ int len = 0;
+ int ret = 0;
+
+ if (skip_dirs) {
+ len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0);
+ hpath = alloca (len + 256); /* NAME_MAX */
+ posix_handle_path (this, fd->inode->gfid, NULL, hpath, len);
+ len = strlen (hpath);
+ hpath[len] = '/';
}
- if (!same_file_type (mode, stbuf.st_mode)) {
- op_ret = -EEXIST;
- gf_log (this->name, GF_LOG_ERROR,
- "entry %s is a different type of file "
- "than expected", pathname);
- goto out;
+ if (!off) {
+ rewinddir (dir);
+ } else {
+ seekdir (dir, off);
}
- out:
- return op_ret;
-}
-static int
-create_entry (xlator_t *this, int32_t flags,
- dir_entry_t *entry, char *pathname)
-{
- int op_ret = 0;
- int ret = -1;
- struct timeval tv[2] = {{0,0},{0,0}};
-
- if (S_ISDIR (entry->buf.st_mode)) {
- /*
- * If the entry is directory, create it by
- * calling 'mkdir'. If the entry is already
- * present, check if it is a directory,
- * and issue a warning if otherwise.
- */
+ while (filled <= size) {
+ in_case = telldir (dir);
- ret = mkdir (pathname, entry->buf.st_mode);
- if (ret == -1) {
- if (errno == EEXIST) {
- op_ret = ensure_file_type (this, pathname,
- entry->buf.st_mode);
- }
- else {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "mkdir %s with mode (0%o) failed: %s",
- pathname, entry->buf.st_mode,
- strerror (errno));
- goto out;
- }
+ if (in_case == -1) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "telldir failed on dir=%p: %s",
+ dir, strerror (errno));
+ goto out;
}
- } else if ((flags & GF_SET_IF_NOT_PRESENT)
- || !(flags & GF_SET_DIR_ONLY)) {
-
- /* create a 0-byte file here */
-
- if (S_ISREG (entry->buf.st_mode)) {
- ret = open (pathname, O_CREAT|O_EXCL,
- entry->buf.st_mode);
+ errno = 0;
+ entry = NULL;
+ readdir_r (dir, (struct dirent *)entrybuf, &entry);
- if (ret == -1) {
- if (errno == EEXIST) {
- op_ret = ensure_file_type (this,
- pathname,
- entry->buf.st_mode);
- }
- else {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "Error creating file %s with "
- "mode (0%o): %s",
- pathname, entry->buf.st_mode,
- strerror (errno));
- goto out;
- }
+ if (!entry) {
+ if (errno == EBADF) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "readdir failed on dir=%p: %s",
+ dir, strerror (errno));
+ goto out;
}
+ break;
+ }
- close (ret);
-
- } else if (S_ISLNK (entry->buf.st_mode)) {
- ret = symlink (entry->link, pathname);
-
- if (ret == -1) {
- if (errno == EEXIST) {
- op_ret = ensure_file_type (this,
- pathname,
- entry->buf.st_mode);
- }
- else {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "error creating symlink %s: %s"
- , pathname, strerror (errno));
- goto out;
- }
- }
+#ifdef __NetBSD__
+ /*
+ * NetBSD with UFS1 backend uses backing files for
+ * extended attributes. They can be found in a
+ * .attribute file located at the root of the filesystem.
+ * We hide it from glusterfs clients, since chaos will occur
+ * when the cluster/dht xlator decides to distribute the
+ * extended attribute backing file across storage servers.
+ */
+ if ((uuid_compare (fd->inode->gfid, rootgfid) == 0)
+ && (!strcmp(entry->d_name, ".attribute")))
+ continue;
+#endif /* __NetBSD__ */
+
+ if ((uuid_compare (fd->inode->gfid, rootgfid) == 0)
+ && (!strcmp (GF_HIDDEN_PATH, entry->d_name))) {
+ continue;
+ }
- } else if (S_ISBLK (entry->buf.st_mode) ||
- S_ISCHR (entry->buf.st_mode) ||
- S_ISFIFO (entry->buf.st_mode) ||
- S_ISSOCK (entry->buf.st_mode)) {
-
- ret = mknod (pathname, entry->buf.st_mode,
- entry->buf.st_dev);
-
- if (ret == -1) {
- if (errno == EEXIST) {
- op_ret = ensure_file_type (this,
- pathname,
- entry->buf.st_mode);
- } else {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "error creating device file "
- "%s: %s",
- pathname, strerror (errno));
- goto out;
- }
+ if (skip_dirs) {
+ if (DT_ISDIR (entry->d_type)) {
+ continue;
+ } else if (hpath) {
+ strcpy (&hpath[len+1],entry->d_name);
+ ret = lstat (hpath, &stbuf);
+ if (!ret && S_ISDIR (stbuf.st_mode))
+ continue;
}
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "invalid mode 0%o for %s", entry->buf.st_mode,
- pathname);
- op_ret = -EINVAL;
- goto out;
- }
- }
-
- /*
- * Preserve atime and mtime
- */
-
- if (!S_ISLNK (entry->buf.st_mode)) {
- tv[0].tv_sec = entry->buf.st_atime;
- tv[1].tv_sec = entry->buf.st_mtime;
- ret = utimes (pathname, tv);
- if (ret == -1) {
- op_ret = -errno;
- gf_log (this->name, GF_LOG_ERROR,
- "utimes %s failed: %s",
- pathname, strerror (errno));
- goto out;
- }
- }
-
-out:
- return op_ret;
-
-}
-
+ }
-int
-posix_setdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags, dir_entry_t *entries,
- int32_t count)
-{
- char * real_path = NULL;
- char * entry_path = NULL;
- int32_t real_path_len = -1;
- int32_t entry_path_len = -1;
- int32_t ret = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct posix_fd * pfd = {0, };
- struct timeval tv[2] = {{0, }, {0, }};
- uint64_t tmp_pfd = 0;
- char pathname[ZR_PATH_MAX] = {0,};
- dir_entry_t * trav = NULL;
+ this_size = max (sizeof (gf_dirent_t),
+ sizeof (gfs3_dirplist))
+ + strlen (entry->d_name) + 1;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (entries, out);
+ if (this_size + filled > size) {
+ seekdir (dir, in_case);
+ break;
+ }
- tv[0].tv_sec = tv[0].tv_usec = 0;
- tv[1].tv_sec = tv[1].tv_usec = 0;
+ this_entry = gf_dirent_for_name (entry->d_name);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
- if (ret < 0) {
- op_errno = -ret;
- gf_log (this->name, GF_LOG_DEBUG,
- "fd's ctx not found on fd=%p for %s",
- fd, this->name);
- goto out;
- }
- pfd = (struct posix_fd *)(long)tmp_pfd;
+ if (!this_entry) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "could not create gf_dirent for entry %s: (%s)",
+ entry->d_name, strerror (errno));
+ goto out;
+ }
+ this_entry->d_off = telldir (dir);
+ this_entry->d_ino = entry->d_ino;
+ this_entry->d_type = entry->d_type;
- real_path = pfd->path;
+ list_add_tail (&this_entry->list, &entries->list);
- if (!real_path) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_DEBUG,
- "path is NULL on pfd=%p fd=%p", pfd, fd);
- goto out;
+ filled += this_size;
+ count ++;
}
- real_path_len = strlen (real_path);
- entry_path_len = real_path_len + 256;
- entry_path = CALLOC (1, entry_path_len);
-
- if (!entry_path) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "Out of memory.");
- goto out;
- }
+ if ((!readdir (dir) && (errno == 0)))
+ /* Indicate EOF */
+ errno = ENOENT;
+out:
+ return count;
+}
- strcpy (entry_path, real_path);
- entry_path[real_path_len] = '/';
+dict_t *
+posix_entry_xattr_fill (xlator_t *this, inode_t *inode,
+ fd_t *fd, char *name, dict_t *dict,
+ struct iatt *stbuf)
+{
+ loc_t tmp_loc = {0,};
+ char *entry_path = NULL;
- /* fd exists, and everything looks fine */
- /**
- * create an entry for each one present in '@entries'
- * - if flag is set (ie, if its namespace), create both directories
- * and files
- * - if not set, create only directories.
- *
- * after the entry is created, change the mode and ownership of the
- * entry according to the stat present in entries->buf.
- */
+ /* if we don't send the 'loc', open-fd-count will be a problem. */
+ tmp_loc.inode = inode;
- trav = entries->next;
- while (trav) {
- strcpy (pathname, entry_path);
- strcat (pathname, trav->name);
+ MAKE_HANDLE_PATH (entry_path, this, fd->inode->gfid, name);
- ret = create_entry (this, flags, trav, pathname);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ return posix_lookup_xattr_fill (this, entry_path,
+ &tmp_loc, dict, stbuf);
- /* TODO: handle another flag, GF_SET_OVERWRITE */
+}
- /* Change the mode */
- if (!S_ISLNK (trav->buf.st_mode)) {
- ret = chmod (pathname, trav->buf.st_mode);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "chmod on %s failed: %s", pathname,
- strerror (op_errno));
- goto out;
- }
- }
- /* change the ownership */
- ret = lchown (pathname, trav->buf.st_uid, trav->buf.st_gid);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "chmod on %s failed: %s", pathname,
- strerror (op_errno));
- goto out;
- }
-
- if (flags & GF_SET_EPOCH_TIME) {
- ret = utimes (pathname, tv);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "utimes on %s failed: %s", pathname,
- strerror (op_errno));
- goto out;
- }
+/**
+ * posix_readdirp_fill - decorate every entry of @entries with an inode,
+ * its stat data and, when @dict is given, an xattr dictionary.
+ *
+ * Each entry name is appended to the handle path of the directory behind
+ * @fd and stat'ed through posix_pstat(). Always returns 0.
+ */
+int
+posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dict)
+{
+ gf_dirent_t *entry = NULL;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ char *hpath = NULL;
+ int len = 0;
+ struct iatt stbuf = {0, };
+ uuid_t gfid;
+
+ if (list_empty(&entries->list))
+ return 0;
+
+ itable = fd->inode->table;
+
+ len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0);
+ hpath = alloca (len + 256); /* NAME_MAX */
+ posix_handle_path (this, fd->inode->gfid, NULL, hpath, len);
+ len = strlen (hpath);
+ hpath[len] = '/';
+
+ list_for_each_entry (entry, &entries->list, list) {
+ /* The result of posix_pstat() is not checked below, so start
+ * every entry from a clean stbuf: otherwise a failed stat
+ * would silently reuse the previous entry's stat and gfid. */
+ memset (&stbuf, 0, sizeof (stbuf));
+ memset (gfid, 0, 16);
+ inode = inode_grep (fd->inode->table, fd->inode,
+ entry->d_name);
+ if (inode)
+ uuid_copy (gfid, inode->gfid);
+
+ strcpy (&hpath[len+1], entry->d_name);
+
+ posix_pstat (this, gfid, hpath, &stbuf);
+
+ if (!inode)
+ inode = inode_find (itable, stbuf.ia_gfid);
+
+ if (!inode)
+ inode = inode_new (itable);
+
+ entry->inode = inode;
+
+ if (dict) {
+ entry->dict =
+ posix_entry_xattr_fill (this, entry->inode,
+ fd, entry->d_name,
+ dict, &stbuf);
+ dict_ref (entry->dict);
}
- /* consider the next entry */
- trav = trav->next;
+ entry->d_stat = stbuf;
+ if (stbuf.ia_ino)
+ entry->d_ino = stbuf.ia_ino;
+ inode = NULL;
}
- op_ret = 0;
- out:
- STACK_UNWIND (frame, op_ret, op_errno);
- if (entry_path)
- FREE (entry_path);
-
- return 0;
+ return 0;
}
+
+/**
+ * posix_do_readdir - common implementation behind posix_readdir and
+ * posix_readdirp.
+ *
+ * Fills a local entry list from the DIR stream stored in the fd context
+ * (under fd->lock), and, when @whichop is GF_FOP_READDIRP, additionally
+ * runs posix_readdirp_fill on it. The list is unwound to the caller and
+ * then freed. op_ret carries the entry count; op_errno carries errno as
+ * left by posix_fill_readdir (ENOENT is used to signal EOF).
+ * Always returns 0.
+ */
int32_t
-posix_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+posix_do_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict)
{
- int _fd = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct stat buf = {0,};
- struct posix_fd * pfd = NULL;
- uint64_t tmp_pfd = 0;
- int ret = -1;
+ struct posix_fd *pfd = NULL;
+ DIR *dir = NULL;
+ int ret = -1;
+ int count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ gf_dirent_t entries;
+ int32_t skip_dirs = 0;
- DECLARE_OLD_FS_ID_VAR;
- SET_FS_ID (frame->root->uid, frame->root->gid);
+ /* Initialise the list head before any "goto out": the exit path
+ * hands &entries to STACK_UNWIND_STRICT and gf_dirent_free. */
+ INIT_LIST_HEAD (&entries.list);
+
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
+ ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_WARNING,
"pfd is NULL, fd=%p", fd);
op_errno = -ret;
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
- _fd = pfd->fd;
+ dir = pfd->dir;
+
+ if (!dir) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dir is NULL for fd=%p", fd);
+ op_errno = EINVAL;
+ goto out;
+ }
- op_ret = fstat (_fd, &buf);
+ /* When READDIR_FILTER option is set to on, we can filter out
+ * directory's entry from the entry->list.
+ */
+ ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs);
+
+ LOCK (&fd->lock);
+ {
+ /* posix_fill_readdir performs multiple separate individual
+ readdir() calls to fill up the buffer.
+
+ In case of NFS where the same anonymous FD is shared between
+ different applications, reading a common directory can
+ result in the anonymous fd getting re-used unsafely between
+ the two readdir requests (in two different io-threads).
+
+ It would also help, in the future, to replace the loop
+ around readdir() with a single large getdents() call.
+ */
+ count = posix_fill_readdir (fd, dir, off, size, &entries, this,
+ skip_dirs);
+ }
+ UNLOCK (&fd->lock);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s",
- fd, strerror (op_errno));
+ /* pick ENOENT to indicate EOF */
+ op_errno = errno;
+ op_ret = count;
+
+ if (whichop != GF_FOP_READDIRP)
goto out;
- }
- op_ret = 0;
+ posix_readdirp_fill (this, fd, &entries, dict);
- out:
- SET_TO_OLD_FS_ID ();
+out:
+ STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free (&entries);
- STACK_UNWIND (frame, op_ret, op_errno, &buf);
return 0;
}
-static int gf_posix_lk_log;
int32_t
-posix_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct flock *lock)
+posix_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t off, dict_t *xdata)
{
- struct flock nullock = {0, };
-
- gf_posix_lk_log++;
-
- GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_ERROR,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND (frame, -1, ENOSYS, &nullock);
+ posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR, xdata);
return 0;
}
+
int32_t
-posix_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct flock *lock)
+posix_readdirp (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t off, dict_t *dict)
{
- gf_log (this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is not loaded. "
- "You need to use it for proper functioning of GlusterFS");
-
- STACK_UNWIND (frame, -1, ENOSYS);
+ posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP, dict);
return 0;
}
+/**
+ * posix_priv - state-dump callback (registered as dumpops.priv).
+ *
+ * Opens a dump section named "<type>.<name>" and writes selected fields
+ * of the xlator's posix_private. Returns 0 in every case, including when
+ * @this or its private data is NULL.
+ */
int32_t
-posix_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct flock *lock)
+posix_priv (xlator_t *this)
{
- gf_log (this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is not loaded. "
- "You need to use it for proper functioning of GlusterFS");
+ struct posix_private *priv = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
- STACK_UNWIND (frame, -1, ENOSYS);
- return 0;
-}
+ /* Guard first: the section name below dereferences 'this'. */
+ if (!this)
+ return 0;
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type,
+ this->name);
+ gf_proc_dump_add_section(key_prefix);
-int32_t
-posix_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- gf_log (this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is not loaded. "
- "You need to use it for proper functioning of GlusterFS");
+ priv = this->private;
+
+ if (!priv)
+ return 0;
+
+ gf_proc_dump_write("base_path","%s", priv->base_path);
+ gf_proc_dump_write("base_path_length","%d", priv->base_path_length);
+ gf_proc_dump_write("max_read","%d", priv->read_value);
+ gf_proc_dump_write("max_write","%d", priv->write_value);
+ gf_proc_dump_write("nr_files","%ld", priv->nr_files);
- STACK_UNWIND (frame, -1, ENOSYS);
return 0;
}
int32_t
-posix_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+posix_inode (xlator_t *this)
{
- gf_log (this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is not loaded. "
- " You need to use it for proper functioning of GlusterFS");
-
- STACK_UNWIND (frame, -1, ENOSYS);
return 0;
}
+/**
+ * posix_rchecksum - read up to @len bytes at @offset from the file behind
+ * @fd and unwind with a weak (rsync rolling) and a strong checksum of the
+ * data.
+ *
+ * The read happens under fd->lock. On failure op_ret is -1 and op_errno
+ * holds ENOMEM, the negated posix_fd_ctx_get() result, or the pread()
+ * errno. Always returns 0; results travel through STACK_UNWIND_STRICT.
+ */
int32_t
-posix_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off)
+posix_rchecksum (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset, int32_t len, dict_t *xdata)
{
- uint64_t tmp_pfd = 0;
- struct posix_fd * pfd = NULL;
- DIR * dir = NULL;
- int ret = -1;
- size_t filled = 0;
- int count = 0;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- gf_dirent_t * this_entry = NULL;
- gf_dirent_t entries;
- struct dirent * entry = NULL;
- off_t in_case = -1;
- int32_t this_size = -1;
-
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+ int _fd = -1;
+ struct posix_fd *pfd = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+ int ret = 0;
+ int32_t weak_checksum = 0;
+ unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0};
+ struct posix_private *priv = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
- INIT_LIST_HEAD (&entries.list);
+ priv = this->private;
+ memset (strong_checksum, 0, MD5_DIGEST_LENGTH);
- ret = fd_ctx_get (fd, this, &tmp_pfd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "pfd is NULL, fd=%p", fd);
- op_errno = -ret;
+ alloc_buf = _page_aligned_alloc (len, &buf);
+ if (!alloc_buf) {
+ op_errno = ENOMEM;
goto out;
}
- pfd = (struct posix_fd *)(long)tmp_pfd;
-
- dir = pfd->dir;
- if (!dir) {
- gf_log (this->name, GF_LOG_DEBUG,
- "dir is NULL for fd=%p", fd);
- op_errno = EINVAL;
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pfd is NULL, fd=%p", fd);
+ op_errno = -ret;
goto out;
}
+ _fd = pfd->fd;
- if (!off) {
- rewinddir (dir);
- } else {
- seekdir (dir, off);
- }
+ LOCK (&fd->lock);
+ {
+ if (priv->aio_capable && priv->aio_init_done)
+ __posix_fd_set_odirect (fd, pfd, 0, offset, len);
- while (filled <= size) {
- in_case = telldir (dir);
+ ret = pread (_fd, buf, len, offset);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pread of %d bytes returned %d (%s)",
+ len, ret, strerror (errno));
- if (in_case == -1) {
op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "telldir failed on dir=%p: %s",
- dir, strerror (errno));
- goto out;
- }
-
- errno = 0;
- entry = readdir (dir);
-
- if (!entry) {
- if (errno == EBADF) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir failed on dir=%p: %s",
- dir, strerror (op_errno));
- goto out;
- }
- break;
- }
-
- this_size = dirent_size (entry);
-
- if (this_size + filled > size) {
- seekdir (dir, in_case);
- break;
}
-
- this_entry = gf_dirent_for_name (entry->d_name);
-
- if (!this_entry) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not create gf_dirent for entry %s: (%s)",
- entry->d_name, strerror (errno));
- goto out;
- }
- this_entry->d_off = telldir (dir);
- this_entry->d_ino = entry->d_ino;
-
- list_add_tail (&this_entry->list, &entries.list);
-
- filled += this_size;
- count ++;
}
+ UNLOCK (&fd->lock);
- op_ret = count;
+ if (ret < 0)
+ goto out;
+
+ /* pread() may return fewer than 'len' bytes (e.g. at end of file);
+ * checksum only the 'ret' bytes that were actually read so that
+ * untouched buffer contents never leak into the result. */
+ weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) ret);
+ gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret, (unsigned char *) strong_checksum);
- out:
- STACK_UNWIND (frame, op_ret, op_errno, &entries);
+ op_ret = 0;
+out:
+ STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno,
+ weak_checksum, strong_checksum, NULL);
- gf_dirent_free (&entries);
+ GF_FREE (alloc_buf);
return 0;
}
+/**
+ * notify - when parent sends PARENT_UP, send CHILD_UP event from here
+ */
int32_t
-posix_stats (call_frame_t *frame, xlator_t *this,
- int32_t flags)
-
+notify (xlator_t *this,
+ int32_t event,
+ void *data,
+ ...)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- struct xlator_stats xlstats = {0, };
- struct xlator_stats * stats = NULL;
- struct statvfs buf = {0,};
- struct timeval tv = {0,};
- struct posix_private * priv = (struct posix_private *)this->private;
-
- int64_t avg_read = 0;
- int64_t avg_write = 0;
- int64_t _time_ms = 0;
-
- DECLARE_OLD_FS_ID_VAR;
-
- SET_FS_ID (frame->root->uid, frame->root->gid);
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
-
- stats = &xlstats;
-
- op_ret = statvfs (priv->base_path, &buf);
-
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s",
- strerror (op_errno));
- goto out;
+ switch (event)
+ {
+ case GF_EVENT_PARENT_UP:
+ {
+ /* Tell the parent that posix xlator is up */
+ default_notify (this, GF_EVENT_CHILD_UP, data);
}
-
- /* client info is maintained at FSd */
- stats->nr_clients = priv->stats.nr_clients;
- stats->nr_files = priv->stats.nr_files;
-
- /* number of free block in the filesystem. */
- stats->free_disk = buf.f_bfree * buf.f_bsize;
-
- stats->total_disk_size = buf.f_blocks * buf.f_bsize;
- stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize;
-
- /* Calculate read and write usage */
- op_ret = gettimeofday (&tv, NULL);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "gettimeofday failed: %s", strerror (errno));
- goto out;
+ break;
+ default:
+ /* */
+ break;
}
+ return 0;
+}
- /* Read */
- _time_ms = (tv.tv_sec - priv->init_time.tv_sec) * 1000 +
- ((tv.tv_usec - priv->init_time.tv_usec) / 1000);
+/**
+ * mem_acct_init - set up memory accounting for this xlator.
+ *
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init() (logged at ERROR level when non-zero).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
- avg_read = (_time_ms) ? (priv->read_value / _time_ms) : 0; /* KBps */
- avg_write = (_time_ms) ? (priv->write_value / _time_ms) : 0; /* KBps */
+ if (!this)
+ return ret;
- _time_ms = (tv.tv_sec - priv->prev_fetch_time.tv_sec) * 1000 +
- ((tv.tv_usec - priv->prev_fetch_time.tv_usec) / 1000);
+ ret = xlator_mem_acct_init (this, gf_posix_mt_end + 1);
- if (_time_ms && ((priv->interval_read / _time_ms) > priv->max_read)) {
- priv->max_read = (priv->interval_read / _time_ms);
+ if (ret != 0) {
+ /* adjacent literals are concatenated: keep the separating space */
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ " failed");
+ return ret;
}
- if (_time_ms &&
- ((priv->interval_write / _time_ms) > priv->max_write)) {
- priv->max_write = priv->interval_write / _time_ms;
- }
+ return ret;
+}
- stats->read_usage = avg_read / priv->max_read;
- stats->write_usage = avg_write / priv->max_write;
+static int
+posix_set_owner (xlator_t *this, uid_t uid, gid_t gid)
+{
+ struct posix_private *priv = NULL;
+ int ret = -1;
- op_ret = gettimeofday (&(priv->prev_fetch_time), NULL);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "gettimeofday failed: %s",
- strerror (op_errno));
- goto out;
- }
+ priv = this->private;
- priv->interval_read = 0;
- priv->interval_write = 0;
+ ret = sys_chown (priv->base_path, uid, gid);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "uid/gid for brick path %s, %s",
+ priv->base_path, strerror (errno));
- op_ret = 0;
+ return ret;
+}
- out:
- SET_TO_OLD_FS_ID ();
- STACK_UNWIND (frame, op_ret, op_errno, stats);
- return 0;
+static int
+set_batch_fsync_mode (struct posix_private *priv, const char *str)
+{
+ if (strcmp (str, "none") == 0)
+ priv->batch_fsync_mode = BATCH_NONE;
+ else if (strcmp (str, "syncfs") == 0)
+ priv->batch_fsync_mode = BATCH_SYNCFS;
+ else if (strcmp (str, "syncfs-single-fsync") == 0)
+ priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC;
+ else if (strcmp (str, "syncfs-reverse-fsync") == 0)
+ priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC;
+ else if (strcmp (str, "reverse-fsync") == 0)
+ priv->batch_fsync_mode = BATCH_REVERSE_FSYNC;
+ else
+ return -1;
+
+ return 0;
}
-int32_t
-posix_checksum (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flag)
+
+int
+reconfigure (xlator_t *this, dict_t *options)
{
- char * real_path = NULL;
- DIR * dir = NULL;
- struct dirent * dirent = NULL;
- uint8_t file_checksum[ZR_FILENAME_MAX] = {0,};
- uint8_t dir_checksum[ZR_FILENAME_MAX] = {0,};
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int i = 0;
- int length = 0;
+ int ret = -1;
+ struct posix_private *priv = NULL;
+ uid_t uid = -1;
+ gid_t gid = -1;
+ char *batch_fsync_mode_str = NULL;
- struct stat buf = {0,};
- char tmp_real_path[ZR_PATH_MAX] = {0,};
- int ret = -1;
+ priv = this->private;
- MAKE_REAL_PATH (real_path, this, loc->path);
+ GF_OPTION_RECONF ("brick-uid", uid, options, uint32, out);
+ GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out);
+ posix_set_owner (this, uid, gid);
- dir = opendir (real_path);
+ GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec,
+ options, uint32, out);
- if (!dir){
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "opendir() failed on `%s': %s",
- real_path, strerror (op_errno));
- goto out;
- }
+ GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str,
+ options, str, out);
- while ((dirent = readdir (dir))) {
- errno = 0;
- if (!dirent) {
- if (errno != 0) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "readdir() failed on dir=%p: %s",
- dir, strerror (errno));
- goto out;
- }
- break;
- }
+ if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s",
+ batch_fsync_mode_str);
+ goto out;
+ }
- length = strlen (dirent->d_name);
+ GF_OPTION_RECONF ("linux-aio", priv->aio_configured,
+ options, bool, out);
- strcpy (tmp_real_path, real_path);
- strcat (tmp_real_path, "/");
- strcat (tmp_real_path, dirent->d_name);
- ret = lstat (tmp_real_path, &buf);
+ if (priv->aio_configured)
+ posix_aio_on (this);
+ else
+ posix_aio_off (this);
- if (ret == -1)
- continue;
+ GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo,
+ options, bool, out);
- if (S_ISDIR (buf.st_mode)) {
- for (i = 0; i < length; i++)
- dir_checksum[i] ^= dirent->d_name[i];
- } else {
- for (i = 0; i < length; i++)
- file_checksum[i] ^= dirent->d_name[i];
- }
+ if (priv->node_uuid_pathinfo &&
+ (uuid_is_null (priv->glusterd_uuid))) {
+ gf_log (this->name, GF_LOG_INFO,
+ "glusterd uuid is NULL, pathinfo xattr would"
+ " fallback to <hostname>:<export>");
}
- closedir (dir);
-
- op_ret = 0;
- out:
- STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum);
+ GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval,
+ options, uint32, out);
+ posix_spawn_health_check_thread (this);
- return 0;
+ ret = 0;
+out:
+ return ret;
}
-/**
- * notify - when parent sends PARENT_UP, send CHILD_UP event from here
- */
-int32_t
-notify (xlator_t *this,
- int32_t event,
- void *data,
- ...)
-{
- switch (event)
- {
- case GF_EVENT_PARENT_UP:
- {
- /* Tell the parent that posix xlator is up */
- default_notify (this, GF_EVENT_CHILD_UP, data);
- }
- break;
- default:
- /* */
- break;
- }
- return 0;
-}
/**
* init -
@@ -3791,13 +4646,24 @@ notify (xlator_t *this,
int
init (xlator_t *this)
{
- int ret = 0;
- int op_ret = -1;
- gf_boolean_t tmp_bool = 0;
- struct stat buf = {0,};
- struct posix_private * _private = NULL;
- data_t * dir_data = NULL;
- data_t * tmp_data = NULL;
+ struct posix_private *_private = NULL;
+ data_t *dir_data = NULL;
+ data_t *tmp_data = NULL;
+ struct stat buf = {0,};
+ gf_boolean_t tmp_bool = 0;
+ int dict_ret = 0;
+ int ret = 0;
+ int op_ret = -1;
+ ssize_t size = -1;
+ int32_t janitor_sleep = 0;
+ uuid_t old_uuid = {0,};
+ uuid_t dict_uuid = {0,};
+ uuid_t gfid = {0,};
+ uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
+ char *guuid = NULL;
+ uid_t uid = -1;
+ gid_t gid = -1;
+ char *batch_fsync_mode_str;
dir_data = dict_get (this->options, "directory");
@@ -3808,10 +4674,10 @@ init (xlator_t *this)
goto out;
}
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling. Please check the volume file.");
- }
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Volume is dangling. Please check the volume file.");
+ }
if (!dir_data) {
gf_log (this->name, GF_LOG_CRITICAL,
@@ -3822,144 +4688,249 @@ init (xlator_t *this)
umask (000); // umask `masking' is done at the client side
- /* Check whether the specified directory exists, if not create it. */
- op_ret = lstat (dir_data->data, &buf);
- if ((ret != 0) || !S_ISDIR (buf.st_mode)) {
+ /* Check whether the specified directory exists, if not log it. */
+ op_ret = stat (dir_data->data, &buf);
+ if ((op_ret != 0) || !S_ISDIR (buf.st_mode)) {
gf_log (this->name, GF_LOG_ERROR,
"Directory '%s' doesn't exist, exiting.",
- dir_data->data);
+ dir_data->data);
ret = -1;
goto out;
}
-
/* Check for Extended attribute support, if not present, log it */
op_ret = sys_lsetxattr (dir_data->data,
- "trusted.glusterfs.test", "working", 8, 0);
- if (op_ret < 0) {
- tmp_data = dict_get (this->options,
- "mandate-attribute");
- if (tmp_data) {
- if (gf_string2boolean (tmp_data->data,
- &tmp_bool) == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "wrong option provided for key "
- "\"mandate-xattr\"");
- ret = -1;
- goto out;
- }
- if (!tmp_bool) {
- gf_log (this->name, GF_LOG_WARNING,
- "Extended attribute not supported, "
- "starting as per option");
- } else {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Extended attribute not supported, "
- "exiting.");
- ret = -1;
- goto out;
- }
- } else {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Extended attribute not supported, exiting.");
- ret = -1;
- goto out;
- }
+ "trusted.glusterfs.test", "working", 8, 0);
+ if (op_ret == 0) {
+ sys_lremovexattr (dir_data->data, "trusted.glusterfs.test");
+ } else {
+ tmp_data = dict_get (this->options,
+ "mandate-attribute");
+ if (tmp_data) {
+ if (gf_string2boolean (tmp_data->data,
+ &tmp_bool) == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong option provided for key "
+ "\"mandate-attribute\"");
+ ret = -1;
+ goto out;
+ }
+ if (!tmp_bool) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Extended attribute not supported, "
+ "starting as per option");
+ } else {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Extended attribute not supported, "
+ "exiting.");
+ ret = -1;
+ goto out;
+ }
+ } else {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Extended attribute not supported, exiting.");
+ ret = -1;
+ goto out;
+ }
}
- _private = CALLOC (1, sizeof (*_private));
+ tmp_data = dict_get (this->options, "volume-id");
+ if (tmp_data) {
+ op_ret = uuid_parse (tmp_data->data, dict_uuid);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong volume-id (%s) set in volume file",
+ tmp_data->data);
+ ret = -1;
+ goto out;
+ }
+ size = sys_lgetxattr (dir_data->data,
+ "trusted.glusterfs.volume-id", old_uuid, 16);
+ if (size == 16) {
+ if (uuid_compare (old_uuid, dict_uuid)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "mismatching volume-id (%s) received. "
+ "already is a part of volume %s ",
+ tmp_data->data, uuid_utoa (old_uuid));
+ ret = -1;
+ goto out;
+ }
+ } else if ((size == -1) && (errno == ENODATA)) {
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "Extended attribute trusted.glusterfs."
+ "volume-id is absent");
+ ret = -1;
+ goto out;
+
+ } else if ((size == -1) && (errno != ENODATA)) {
+ /* Wrong 'volume-id' is set, it should be error */
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to fetch volume-id (%s)",
+ dir_data->data, strerror (errno));
+ ret = -1;
+ goto out;
+ } else {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to fetch proper volume id from export");
+ goto out;
+ }
+ }
+
+ /* Now check if the export directory has some other 'gfid',
+ other than that of root '/' */
+ size = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16);
+ if (size == 16) {
+ if (!__is_root_gfid (gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: gfid (%s) is not that of glusterfs '/' ",
+ dir_data->data, uuid_utoa (gfid));
+ ret = -1;
+ goto out;
+ }
+ } else if (size != -1) {
+ /* Wrong 'gfid' is set, it should be error */
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: wrong value set as gfid",
+ dir_data->data);
+ ret = -1;
+ goto out;
+ } else if ((size == -1) && (errno != ENODATA)) {
+ /* Wrong 'gfid' is set, it should be error */
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to fetch gfid (%s)",
+ dir_data->data, strerror (errno));
+ ret = -1;
+ goto out;
+ } else {
+ /* First time volume, set the GFID */
+ size = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid,
+ 16, XATTR_CREATE);
+ if (size) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to set gfid (%s)",
+ dir_data->data, strerror (errno));
+ ret = -1;
+ goto out;
+ }
+ }
+
+ size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR,
+ NULL, 0);
+ if ((size < 0) && (errno == ENOTSUP))
+ gf_log (this->name, GF_LOG_WARNING,
+ "Posix access control list is not supported.");
+
+ ret = 0;
+ _private = GF_CALLOC (1, sizeof (*_private),
+ gf_posix_mt_posix_private);
if (!_private) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
ret = -1;
goto out;
}
- _private->base_path = strdup (dir_data->data);
+ _private->base_path = gf_strdup (dir_data->data);
_private->base_path_length = strlen (_private->base_path);
- {
- /* Stats related variables */
- gettimeofday (&_private->init_time, NULL);
- gettimeofday (&_private->prev_fetch_time, NULL);
- _private->max_read = 1;
- _private->max_write = 1;
+ LOCK_INIT (&_private->lock);
+
+ ret = dict_get_str (this->options, "hostname", &_private->hostname);
+ if (ret) {
+ _private->hostname = GF_CALLOC (256, sizeof (char),
+ gf_common_mt_char);
+ if (!_private->hostname) {
+ goto out;
+ }
+ ret = gethostname (_private->hostname, 256);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)", strerror (errno));
+ }
}
_private->export_statfs = 1;
tmp_data = dict_get (this->options, "export-statfs-size");
if (tmp_data) {
- if (gf_string2boolean (tmp_data->data,
- &_private->export_statfs) == -1) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "'export-statfs-size' takes only boolean "
- "options");
- goto out;
- }
+ if (gf_string2boolean (tmp_data->data,
+ &_private->export_statfs) == -1) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "'export-statfs-size' takes only boolean "
+ "options");
+ goto out;
+ }
if (!_private->export_statfs)
gf_log (this->name, GF_LOG_DEBUG,
- "'statfs()' returns dummy size");
+ "'statfs()' returns dummy size");
}
_private->background_unlink = 0;
tmp_data = dict_get (this->options, "background-unlink");
if (tmp_data) {
- if (gf_string2boolean (tmp_data->data,
- &_private->background_unlink) == -1) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "'export-statfs-size' takes only boolean "
- "options");
- goto out;
- }
+ if (gf_string2boolean (tmp_data->data,
+ &_private->background_unlink) == -1) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "'background-unlink' takes only boolean "
+ "options");
+ goto out;
+ }
if (_private->background_unlink)
gf_log (this->name, GF_LOG_DEBUG,
- "unlinks will be performed in background");
+ "unlinks will be performed in background");
}
tmp_data = dict_get (this->options, "o-direct");
if (tmp_data) {
- if (gf_string2boolean (tmp_data->data,
- &_private->o_direct) == -1) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "wrong option provided for 'o-direct'");
- goto out;
- }
- if (_private->o_direct)
+ if (gf_string2boolean (tmp_data->data,
+ &_private->o_direct) == -1) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong option provided for 'o-direct'");
+ goto out;
+ }
+ if (_private->o_direct)
gf_log (this->name, GF_LOG_DEBUG,
"o-direct mode is enabled (O_DIRECT "
- "for every open)");
+ "for every open)");
}
- _private->num_devices_to_span = 1;
-
- tmp_data = dict_get (this->options, "span-devices");
- if (tmp_data) {
- if (gf_string2int32 (tmp_data->data,
- &_private->num_devices_to_span) == -1) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "wrong option provided for 'span-devices'");
- goto out;
- }
- if (_private->num_devices_to_span > 1) {
- gf_log (this->name, GF_LOG_NORMAL,
- "spanning enabled accross %d mounts",
- _private->num_devices_to_span);
- _private->span_devices = 1;
- }
- if (_private->num_devices_to_span < 1)
- _private->num_devices_to_span = 1;
+ ret = dict_get_str (this->options, "glusterd-uuid", &guuid);
+ if (!ret) {
+ if (uuid_parse (guuid, _private->glusterd_uuid))
+ gf_log (this->name, GF_LOG_WARNING, "Cannot parse "
+ "glusterd (node) UUID, node-uuid xattr "
+ "request would return - \"No such attribute\"");
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG, "No glusterd (node) UUID "
+ "passed - node-uuid xattr request will return "
+ "\"No such attribute\"");
}
- _private->st_device = CALLOC (1, (sizeof (dev_t) *
- _private->num_devices_to_span));
-
- /* Start with the base */
- _private->st_device[0] = buf.st_dev;
+ ret = 0;
+ _private->janitor_sleep_duration = 600;
+
+ dict_ret = dict_get_int32 (this->options, "janitor-sleep-duration",
+ &janitor_sleep);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Setting janitor sleep duration to %d.",
+ janitor_sleep);
+
+ _private->janitor_sleep_duration = janitor_sleep;
+ }
+ /* performing open dir on brick dir locks the brick dir
+ * and prevents it from being unmounted
+ */
+ _private->mount_lock = opendir (dir_data->data);
+ if (!_private->mount_lock) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not lock brick directory");
+ goto out;
+ }
#ifndef GF_DARWIN_HOST_OS
{
struct rlimit lim;
@@ -3968,29 +4939,106 @@ init (xlator_t *this)
if (setrlimit (RLIMIT_NOFILE, &lim) == -1) {
gf_log (this->name, GF_LOG_WARNING,
- "Failed to set 'ulimit -n "
- " 1048576': %s", strerror(errno));
+ "Failed to set 'ulimit -n "
+ " 1048576': %s", strerror(errno));
lim.rlim_cur = 65536;
lim.rlim_max = 65536;
if (setrlimit (RLIMIT_NOFILE, &lim) == -1) {
gf_log (this->name, GF_LOG_WARNING,
- "Failed to set maximum allowed open "
- "file descriptors to 64k: %s",
+ "Failed to set maximum allowed open "
+ "file descriptors to 64k: %s",
strerror(errno));
}
else {
- gf_log (this->name, GF_LOG_NORMAL,
- "Maximum allowed open file descriptors "
+ gf_log (this->name, GF_LOG_INFO,
+ "Maximum allowed open file descriptors "
"set to 65536");
}
}
}
#endif
-
this->private = (void *)_private;
- out:
+ op_ret = posix_handle_init (this);
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Posix handle setup failed");
+ ret = -1;
+ goto out;
+ }
+
+ op_ret = posix_handle_trash_init (this);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Posix landfill setup failed");
+ ret = -1;
+ goto out;
+ }
+
+ _private->aio_init_done = _gf_false;
+ _private->aio_capable = _gf_false;
+
+ GF_OPTION_INIT ("brick-uid", uid, uint32, out);
+ GF_OPTION_INIT ("brick-gid", gid, uint32, out);
+ posix_set_owner (this, uid, gid);
+
+ GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out);
+
+ if (_private->aio_configured) {
+ op_ret = posix_aio_on (this);
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Posix AIO init failed");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ GF_OPTION_INIT ("node-uuid-pathinfo",
+ _private->node_uuid_pathinfo, bool, out);
+ if (_private->node_uuid_pathinfo &&
+ (uuid_is_null (_private->glusterd_uuid))) {
+ gf_log (this->name, GF_LOG_INFO,
+ "glusterd uuid is NULL, pathinfo xattr would"
+ " fallback to <hostname>:<export>");
+ }
+
+ _private->health_check_active = _gf_false;
+ GF_OPTION_INIT ("health-check-interval",
+ _private->health_check_interval, uint32, out);
+ if (_private->health_check_interval)
+ posix_spawn_health_check_thread (this);
+
+ pthread_mutex_init (&_private->janitor_lock, NULL);
+ pthread_cond_init (&_private->janitor_cond, NULL);
+ INIT_LIST_HEAD (&_private->janitor_fds);
+
+ posix_spawn_janitor_thread (this);
+
+ pthread_mutex_init (&_private->fsync_mutex, NULL);
+ pthread_cond_init (&_private->fsync_cond, NULL);
+ INIT_LIST_HEAD (&_private->fsyncs);
+
+ ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fsyncer thread"
+ " creation failed (%s)", strerror (errno));
+ goto out;
+ }
+
+ GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out);
+
+ if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s",
+ batch_fsync_mode_str);
+ goto out;
+ }
+
+ GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
+ uint32, out);
+out:
return ret;
}
@@ -3998,13 +5046,19 @@ void
fini (xlator_t *this)
{
struct posix_private *priv = this->private;
- sys_lremovexattr (priv->base_path, "trusted.glusterfs.test");
- FREE (priv);
+ if (!priv)
+ return;
+ this->private = NULL;
+ /*unlock brick dir*/
+ if (priv->mount_lock)
+ closedir (priv->mount_lock);
+ GF_FREE (priv);
return;
}
-struct xlator_mops mops = {
- .stats = posix_stats,
+struct xlator_dumpops dumpops = {
+ .priv = posix_priv,
+ .inode = posix_inode,
};
struct xlator_fops fops = {
@@ -4012,6 +5066,7 @@ struct xlator_fops fops = {
.stat = posix_stat,
.opendir = posix_opendir,
.readdir = posix_readdir,
+ .readdirp = posix_readdirp,
.readlink = posix_readlink,
.mknod = posix_mknod,
.mkdir = posix_mkdir,
@@ -4020,10 +5075,7 @@ struct xlator_fops fops = {
.symlink = posix_symlink,
.rename = posix_rename,
.link = posix_link,
- .chmod = posix_chmod,
- .chown = posix_chown,
.truncate = posix_truncate,
- .utimens = posix_utimens,
.create = posix_create,
.open = posix_open,
.readv = posix_readv,
@@ -4036,42 +5088,105 @@ struct xlator_fops fops = {
.getxattr = posix_getxattr,
.fgetxattr = posix_fgetxattr,
.removexattr = posix_removexattr,
+ .fremovexattr = posix_fremovexattr,
.fsyncdir = posix_fsyncdir,
.access = posix_access,
.ftruncate = posix_ftruncate,
.fstat = posix_fstat,
.lk = posix_lk,
- .inodelk = posix_inodelk,
- .finodelk = posix_finodelk,
- .entrylk = posix_entrylk,
- .fentrylk = posix_fentrylk,
- .fchown = posix_fchown,
- .fchmod = posix_fchmod,
- .setdents = posix_setdents,
- .getdents = posix_getdents,
- .checksum = posix_checksum,
- .xattrop = posix_xattrop,
- .fxattrop = posix_fxattrop,
+ .inodelk = posix_inodelk,
+ .finodelk = posix_finodelk,
+ .entrylk = posix_entrylk,
+ .fentrylk = posix_fentrylk,
+ .rchecksum = posix_rchecksum,
+ .xattrop = posix_xattrop,
+ .fxattrop = posix_fxattrop,
+ .setattr = posix_setattr, /* NOTE(review): setattr/fsetattr presumably supersede the removed chmod/chown/utimens and fchown/fchmod entries - confirm */
+ .fsetattr = posix_fsetattr,
+ .fallocate = _posix_fallocate, /* handler name carries a leading underscore, unlike its siblings */
+ .discard = posix_discard,
+ .zerofill = posix_zerofill,
};
struct xlator_cbks cbks = {
- .release = posix_release,
- .releasedir = posix_releasedir,
- .forget = posix_forget
+ .release = posix_release, /* NOTE(review): presumably the fd/dir/inode teardown callbacks - confirm against struct xlator_cbks */
+ .releasedir = posix_releasedir,
+ .forget = posix_forget
};
struct volume_options options[] = {
- { .key = {"o-direct"},
- .type = GF_OPTION_TYPE_BOOL },
- { .key = {"directory"},
- .type = GF_OPTION_TYPE_PATH },
- { .key = {"export-statfs-size"},
- .type = GF_OPTION_TYPE_BOOL },
- { .key = {"mandate-attribute"},
- .type = GF_OPTION_TYPE_BOOL },
- { .key = {"span-devices"},
- .type = GF_OPTION_TYPE_INT },
+ { .key = {"o-direct"},
+ .type = GF_OPTION_TYPE_BOOL },
+ { .key = {"directory"},
+ .type = GF_OPTION_TYPE_PATH },
+ { .key = {"hostname"},
+ .type = GF_OPTION_TYPE_ANY },
+ { .key = {"export-statfs-size"},
+ .type = GF_OPTION_TYPE_BOOL },
+ { .key = {"mandate-attribute"},
+ .type = GF_OPTION_TYPE_BOOL },
{ .key = {"background-unlink"},
- .type = GF_OPTION_TYPE_BOOL },
- { .key = {NULL} }
+ .type = GF_OPTION_TYPE_BOOL },
+ { .key = {"janitor-sleep-duration"},
+ .type = GF_OPTION_TYPE_INT },
+ { .key = {"volume-id"},
+ .type = GF_OPTION_TYPE_ANY },
+ { .key = {"glusterd-uuid"},
+ .type = GF_OPTION_TYPE_STR },
+ {
+ .key = {"linux-aio"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for native Linux AIO"
+ },
+ {
+ .key = {"brick-uid"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Support for setting uid of brick's owner"
+ },
+ {
+ .key = {"brick-gid"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Support for setting gid of brick's owner"
+ },
+ { .key = {"node-uuid-pathinfo"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "return glusterd's node-uuid in pathinfo xattr"
+ " string instead of hostname"
+ },
+ {
+ .key = {"health-check-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "30",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Interval in seconds for a filesystem health check, "
+ "set to 0 to disable"
+ },
+ { .key = {"batch-fsync-mode"}, /* the string value is handed to set_batch_fsync_mode () at setup; an unknown string is logged as an error */
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "reverse-fsync",
+ .description = "Possible values:\n"
+ "\t- syncfs: Perform one syncfs() on behalf of a batch"
+ " of fsyncs.\n"
+ "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch"
+ " of fsyncs and one fsync() per batch.\n"
+ "\t- syncfs-reverse-fsync: Perform one syncfs() on behalf of a batch"
+ " of fsyncs and fsync() each file in the batch"
+ " in reverse order.\n"
+ "\t- reverse-fsync: Perform fsync() of each file in the batch in"
+ " reverse order."
+ },
+ { .key = {"batch-fsync-delay-usec"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description = "Num of usecs to wait for aggregating fsync"
+ " requests",
+ },
+ { .key = {NULL} } /* NOTE(review): presumably the end-of-table sentinel - confirm */
};
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index ed6b46430..3121db271 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef _POSIX_H
#define _POSIX_H
@@ -29,6 +19,7 @@
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
+#include <time.h>
#ifdef linux
#ifdef __GLIBC__
@@ -49,7 +40,18 @@
#include "xlator.h"
#include "inode.h"
#include "compat.h"
+#include "timer.h"
+#include "posix-mem-types.h"
+#include "posix-handle.h"
+#include "call-stub.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#include "posix-aio.h"
+#endif
+#define VECTOR_SIZE (64 * 1024) /* vector size 64KB; parenthesized so it expands safely inside larger expressions */
+#define MAX_NO_VECT 1024
/**
* posix_fd - internal structure common to file and directory fd's
*/
@@ -57,27 +59,33 @@
struct posix_fd {
int fd; /* fd returned by the kernel */
int32_t flags; /* flags for open/creat */
- char * path; /* used by setdents/getdents */
DIR * dir; /* handle returned by the kernel */
+ int odirect; /* NOTE(review): presumably the O_DIRECT state maintained by __posix_fd_set_odirect () - confirm */
+ struct list_head list; /* to add to the janitor list */
};
+
struct posix_private {
char *base_path;
int32_t base_path_length;
+ gf_lock_t lock;
+
+ char *hostname;
/* Statistics, provides activity of the server */
- struct xlator_stats stats;
-
+
struct timeval prev_fetch_time;
struct timeval init_time;
- int32_t max_read; /* */
- int32_t max_write; /* */
- int64_t interval_read; /* Used to calculate the max_read value */
- int64_t interval_write; /* Used to calculate the max_write value */
+ time_t last_landfill_check;
+ int32_t janitor_sleep_duration;
+ struct list_head janitor_fds;
+ pthread_cond_t janitor_cond;
+ pthread_mutex_t janitor_lock;
+
int64_t read_value; /* Total read, from init */
int64_t write_value; /* Total write, from init */
-
+ int64_t nr_files;
/*
In some cases, two exported volumes may reside on the same
partition on the server. Sending statvfs info for both
@@ -91,28 +99,110 @@ struct posix_private {
gf_boolean_t o_direct; /* always open files in O_DIRECT mode */
- gf_boolean_t span_devices;
-/*
+/*
decide whether posix_unlink does open (file), unlink (file), close (fd)
instead of just unlink (file). with the former approach there is no lockout
of access to parent directory during removal of very large files for the
entire duration of freeing of data blocks.
-*/
+*/
gf_boolean_t background_unlink;
- int num_devices_to_span;
- dev_t *st_device;
+/* janitor thread which cleans up /.trash (created by replicate) */
+ pthread_t janitor;
+ gf_boolean_t janitor_present;
+ char * trash_path;
+/* lock for brick dir */
+ DIR *mount_lock; /* fini () closes this with closedir () when it is non-NULL */
+
+ struct stat handledir;
+
+/* uuid of glusterd that spawned the brick process */
+ uuid_t glusterd_uuid;
+
+ gf_boolean_t aio_configured;
+ gf_boolean_t aio_init_done;
+ gf_boolean_t aio_capable;
+#ifdef HAVE_LIBAIO
+ io_context_t ctxp;
+ pthread_t aiothread;
+#endif
+
+ /* node-uuid in pathinfo xattr */
+ gf_boolean_t node_uuid_pathinfo;
+
+ pthread_t fsyncer; /* started via gf_thread_create () with posix_fsyncer as the entry point */
+ struct list_head fsyncs;
+ pthread_mutex_t fsync_mutex;
+ pthread_cond_t fsync_cond;
+ int fsync_queue_count;
+
+ enum {
+ BATCH_NONE = 0,
+ BATCH_SYNCFS,
+ BATCH_SYNCFS_SINGLE_FSYNC,
+ BATCH_REVERSE_FSYNC,
+ BATCH_SYNCFS_REVERSE_FSYNC
+ } batch_fsync_mode;
+
+ uint32_t batch_fsync_delay_usec;
+
+ /* seconds to sleep between health checks */
+ uint32_t health_check_interval;
+ pthread_t health_check;
+ gf_boolean_t health_check_active; /* set to _gf_false at setup, before "health-check-interval" is read */
};
+typedef struct { /* NOTE(review): looks like the context handed to the xattr-filling helpers (e.g. posix_lookup_xattr_fill) - confirm */
+ xlator_t *this;
+ const char *real_path;
+ dict_t *xattr;
+ struct iatt *stbuf;
+ loc_t *loc;
+ inode_t *inode; /* for all do_xattrop() key handling */
+ int fd;
+ int flags;
+ int32_t op_errno;
+} posix_xattr_filler_t;
+
+
#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path)
#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length)
-#define MAKE_REAL_PATH(var, this, path) do { \
- var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \
- strcpy (var, POSIX_BASE_PATH(this)); \
- strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \
- } while (0)
-
+/* Helper functions */
+int posix_gfid_set (xlator_t *this, const char *path, loc_t *loc,
+ dict_t *xattr_req);
+int posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p);
+int posix_istat (xlator_t *this, uuid_t gfid, const char *basename,
+ struct iatt *iatt);
+int posix_pstat (xlator_t *this, uuid_t gfid, const char *real_path,
+ struct iatt *iatt);
+dict_t *posix_lookup_xattr_fill (xlator_t *this, const char *path,
+ loc_t *loc, dict_t *xattr, struct iatt *buf);
+int posix_handle_pair (xlator_t *this, const char *real_path, char *key,
+ data_t *value, int flags);
+int posix_fhandle_pair (xlator_t *this, int fd, char *key, data_t *value,
+ int flags);
+void posix_spawn_janitor_thread (xlator_t *this); /* called at setup right after janitor_lock, janitor_cond and janitor_fds are initialised */
+int posix_get_file_contents (xlator_t *this, uuid_t pargfid,
+ const char *name, char **contents);
+int posix_set_file_contents (xlator_t *this, const char *path, char *key,
+ data_t *value, int flags);
+int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req);
+int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req);
+int posix_entry_create_xattr_set (xlator_t *this, const char *path,
+ dict_t *dict);
+
+int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd);
+void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf);
+
+gf_boolean_t posix_special_xattr (char **pattern, char *key);
+
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size);
+void posix_spawn_health_check_thread (xlator_t *this); /* setup only calls this when health_check_interval is non-zero */
+
+void *posix_fsyncer (void *); /* thread entry point; setup passes the xlator_t (this) as its argument */
#endif /* _POSIX_H */
#endif /* _POSIX_H */