summaryrefslogtreecommitdiffstats
path: root/xlators/storage/posix/src/posix-aio.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage/posix/src/posix-aio.c')
-rw-r--r--xlators/storage/posix/src/posix-aio.c556
1 files changed, 556 insertions, 0 deletions
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
new file mode 100644
index 00000000000..d0cb0002bbf
--- /dev/null
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -0,0 +1,556 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include "posix.h"
+#include <sys/uio.h>
+#include "posix-messages.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+
+void
+__posix_fd_set_odirect(fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ int odirect = 0;
+ int flags = 0;
+ int ret = 0;
+
+ odirect = pfd->odirect;
+
+ if ((fd->flags | opflags) & O_DIRECT) {
+ /* if instructed, use O_DIRECT always */
+ odirect = 1;
+ } else {
+ /* else use O_DIRECT when feasible */
+ if ((offset | size) & 0xfff)
+ odirect = 0;
+ else
+ odirect = 1;
+ }
+
+ if (!odirect && pfd->odirect) {
+ flags = fcntl(pfd->fd, F_GETFL);
+ ret = fcntl(pfd->fd, F_SETFL, (flags & (~O_DIRECT)));
+ pfd->odirect = 0;
+ }
+
+ if (odirect && !pfd->odirect) {
+ flags = fcntl(pfd->fd, F_GETFL);
+ ret = fcntl(pfd->fd, F_SETFL, (flags | O_DIRECT));
+ pfd->odirect = 1;
+ }
+
+ if (ret) {
+ gf_msg(THIS->name, GF_LOG_WARNING, errno, P_MSG_FCNTL_FAILED,
+ "fcntl() failed. fd=%d flags=%d pfd->odirect=%d", pfd->fd, flags,
+ pfd->odirect);
+ }
+}
+
+struct posix_aio_cb {
+ struct iocb iocb;
+ call_frame_t *frame;
+ struct iobuf *iobuf;
+ struct iobref *iobref;
+ struct iatt prebuf;
+ int _fd;
+ fd_t *fd;
+ int op;
+ off_t offset;
+};
+
+int
+posix_aio_readv_complete(struct posix_aio_cb *paiocb, int res, int res2)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iatt postbuf = {
+ 0,
+ };
+ int _fd = -1;
+ int op_ret = -1;
+ int op_errno = 0;
+ struct iovec iov;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ off_t offset = 0;
+ struct posix_private *priv = NULL;
+ fd_t *fd = NULL;
+
+ frame = paiocb->frame;
+ this = frame->this;
+ priv = this->private;
+ iobuf = paiocb->iobuf;
+ fd = paiocb->fd;
+ _fd = paiocb->_fd;
+ offset = paiocb->offset;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READV_FAILED,
+ "readv(async) failed fd=%d,size=%lu,offset=%llu (%d)", _fd,
+ paiocb->iocb.u.c.nbytes, (unsigned long long)paiocb->offset,
+ res);
+ goto out;
+ }
+
+ ret = posix_fdstat(this, fd->inode, _fd, &postbuf);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%d", _fd);
+ goto out;
+ }
+
+ op_ret = res;
+ op_errno = 0;
+
+ iobref = iobref_new();
+ if (!iobref) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ iobref_add(iobref, iobuf);
+
+ iov.iov_base = iobuf_ptr(iobuf);
+ iov.iov_len = op_ret;
+
+ /* Hack to notify higher layers of EOF. */
+ if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+ op_errno = ENOENT;
+
+ GF_ATOMIC_ADD(priv->read_value, op_ret);
+
+out:
+ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf,
+ iobref, NULL);
+ if (iobuf)
+ iobuf_unref(iobuf);
+ if (iobref)
+ iobref_unref(iobref);
+
+ if (paiocb->fd)
+ fd_unref(paiocb->fd);
+
+ GF_FREE(paiocb);
+
+ return 0;
+}
+
+int
+posix_aio_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ int32_t op_errno = EINVAL;
+ int _fd = -1;
+ struct iobuf *iobuf = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_aio_cb *paiocb = NULL;
+ struct posix_private *priv = NULL;
+ struct iocb *iocb = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ priv = this->private;
+
+ ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL,
+ "pfd is NULL from fd=%p", fd);
+ goto err;
+ }
+ _fd = pfd->fd;
+
+ if (!size) {
+ op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_INVALID_ARGUMENT,
+ "size=%" GF_PRI_SIZET, size);
+ goto err;
+ }
+
+ iobuf = iobuf_get2(this->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ paiocb = GF_CALLOC(1, sizeof(*paiocb), gf_posix_mt_paiocb);
+ if (!paiocb) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ paiocb->frame = frame;
+ paiocb->iobuf = iobuf;
+ paiocb->offset = offset;
+ paiocb->fd = fd_ref(fd);
+ paiocb->_fd = _fd;
+ paiocb->op = GF_FOP_READ;
+
+ paiocb->iocb.data = paiocb;
+ paiocb->iocb.aio_fildes = _fd;
+ paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD;
+ paiocb->iocb.aio_reqprio = 0;
+ paiocb->iocb.u.c.buf = iobuf_ptr(iobuf);
+ paiocb->iocb.u.c.nbytes = size;
+ paiocb->iocb.u.c.offset = offset;
+
+ iocb = &paiocb->iocb;
+
+ LOCK(&fd->lock);
+ {
+ __posix_fd_set_odirect(fd, pfd, flags, offset, size);
+
+ ret = io_submit(priv->ctxp, 1, &iocb);
+ }
+ UNLOCK(&fd->lock);
+
+ if (ret != 1) {
+ op_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_IO_SUBMIT_FAILED,
+ "io_submit() returned %d", ret);
+ goto err;
+ }
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+ if (iobuf)
+ iobuf_unref(iobuf);
+
+ if (paiocb) {
+ if (paiocb->fd)
+ fd_unref(paiocb->fd);
+ GF_FREE(paiocb);
+ }
+
+ return 0;
+}
+
+int
+posix_aio_writev_complete(struct posix_aio_cb *paiocb, int res, int res2)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct iatt prebuf = {
+ 0,
+ };
+ struct iatt postbuf = {
+ 0,
+ };
+ int _fd = -1;
+ int op_ret = -1;
+ int op_errno = 0;
+ int ret = 0;
+ struct posix_private *priv = NULL;
+ fd_t *fd = NULL;
+
+ if (!paiocb) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = paiocb->frame;
+ this = frame->this;
+ priv = this->private;
+ prebuf = paiocb->prebuf;
+ fd = paiocb->fd;
+ _fd = paiocb->_fd;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_WRITEV_FAILED,
+ "writev(async) failed fd=%d,offset=%llu (%d)", _fd,
+ (unsigned long long)paiocb->offset, res);
+
+ goto out;
+ }
+
+ ret = posix_fdstat(this, fd->inode, _fd, &postbuf);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%d", _fd);
+ goto out;
+ }
+
+ op_ret = res;
+ op_errno = 0;
+
+ GF_ATOMIC_ADD(priv->write_value, op_ret);
+
+out:
+ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+ NULL);
+
+ if (paiocb) {
+ if (paiocb->iobref)
+ iobref_unref(paiocb->iobref);
+ if (paiocb->fd)
+ fd_unref(paiocb->fd);
+ GF_FREE(paiocb);
+ }
+
+ return 0;
+}
+
+int
+posix_aio_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *iov, int count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int32_t op_errno = EINVAL;
+ int _fd = -1;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_aio_cb *paiocb = NULL;
+ struct posix_private *priv = NULL;
+ struct iocb *iocb = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ priv = this->private;
+ DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_errno, op_errno, err);
+
+ ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL,
+ "pfd is NULL from fd=%p", fd);
+ goto err;
+ }
+ _fd = pfd->fd;
+
+ paiocb = GF_CALLOC(1, sizeof(*paiocb), gf_posix_mt_paiocb);
+ if (!paiocb) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ paiocb->frame = frame;
+ paiocb->offset = offset;
+ paiocb->fd = fd_ref(fd);
+ paiocb->_fd = _fd;
+ paiocb->op = GF_FOP_WRITE;
+
+ paiocb->iocb.data = paiocb;
+ paiocb->iocb.aio_fildes = _fd;
+ paiocb->iobref = iobref_ref(iobref);
+ paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV;
+ paiocb->iocb.aio_reqprio = 0;
+ paiocb->iocb.u.v.vec = iov;
+ paiocb->iocb.u.v.nr = count;
+ paiocb->iocb.u.v.offset = offset;
+
+ iocb = &paiocb->iocb;
+
+ ret = posix_fdstat(this, fd->inode, _fd, &paiocb->prebuf);
+ if (ret != 0) {
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%p", fd);
+ goto err;
+ }
+
+ LOCK(&fd->lock);
+ {
+ __posix_fd_set_odirect(fd, pfd, flags, offset, iov_length(iov, count));
+
+ ret = io_submit(priv->ctxp, 1, &iocb);
+ }
+ UNLOCK(&fd->lock);
+
+ if (ret != 1) {
+ op_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_IO_SUBMIT_FAILED,
+ "io_submit() returned %d,gfid=%s", ret,
+ uuid_utoa(fd->inode->gfid));
+ goto err;
+ }
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(writev, frame, -1, op_errno, 0, 0, 0);
+
+ if (paiocb) {
+ if (paiocb->iobref)
+ iobref_unref(paiocb->iobref);
+ if (paiocb->fd)
+ fd_unref(paiocb->fd);
+ GF_FREE(paiocb);
+ }
+
+ return 0;
+}
+
+void *
+posix_aio_thread(void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ int ret = 0;
+ int i = 0;
+ struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS];
+ struct io_event *event = NULL;
+ struct posix_aio_cb *paiocb = NULL;
+
+ this = data;
+ THIS = this;
+ priv = this->private;
+
+ for (;;) {
+ memset(&events[0], 0, sizeof(events));
+ ret = io_getevents(priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS,
+ &events[0], NULL);
+ if (ret <= 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_IO_GETEVENTS_FAILED,
+ "io_getevents() returned %d", ret);
+ if (ret == -EINTR)
+ continue;
+ break;
+ }
+
+ for (i = 0; i < ret; i++) {
+ event = &events[i];
+
+ paiocb = event->data;
+
+ switch (paiocb->op) {
+ case GF_FOP_READ:
+ posix_aio_readv_complete(paiocb, event->res, event->res2);
+ break;
+ case GF_FOP_WRITE:
+ posix_aio_writev_complete(paiocb, event->res, event->res2);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_UNKNOWN_OP,
+ "unknown op %d found in piocb", paiocb->op);
+ break;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+posix_aio_init(xlator_t *this)
+{
+ struct posix_private *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ ret = io_setup(POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp);
+ if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_AIO_UNAVAILABLE,
+ "Linux AIO not available at run-time."
+ " Continuing with synchronous IO");
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_IO_SETUP_FAILED,
+ "io_setup() failed. ret=%d", ret);
+ goto out;
+ }
+
+ ret = gf_thread_create(&priv->aiothread, NULL, posix_aio_thread, this,
+ "posixaio");
+ if (ret != 0) {
+ io_destroy(priv->ctxp);
+ goto out;
+ }
+
+ this->fops->readv = posix_aio_readv;
+ this->fops->writev = posix_aio_writev;
+out:
+ return ret;
+}
+
+int
+posix_aio_on(xlator_t *this)
+{
+ struct posix_private *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ if (!priv->aio_init_done) {
+ ret = posix_aio_init(this);
+ if (ret == 0)
+ priv->aio_capable = _gf_true;
+ else
+ priv->aio_capable = _gf_false;
+ priv->aio_init_done = _gf_true;
+ }
+
+ if (priv->aio_capable) {
+ this->fops->readv = posix_aio_readv;
+ this->fops->writev = posix_aio_writev;
+ }
+
+ return ret;
+}
+
+int
+posix_aio_off(xlator_t *this)
+{
+ this->fops->readv = posix_readv;
+ this->fops->writev = posix_writev;
+
+ return 0;
+}
+
+#else
+
+int
+posix_aio_on(xlator_t *this)
+{
+ gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+int
+posix_aio_off(xlator_t *this)
+{
+ gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+void
+__posix_fd_set_odirect(fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_AIO_UNAVAILABLE,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
+
+#endif