diff options
Diffstat (limited to 'xlators/storage/posix/src')
| -rw-r--r-- | xlators/storage/posix/src/Makefile.am | 18 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 569 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.h | 39 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 744 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 143 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 1391 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-mem-types.h | 27 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 5723 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 158 |
9 files changed, 6466 insertions, 2346 deletions
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 9acaad651..88efcc784 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -2,16 +2,18 @@ xlator_LTLIBRARIES = posix.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module -avoid-version -posix_la_SOURCES = posix.c -posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) -noinst_HEADERS = posix.h +noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h -AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ - $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -CLEANFILES = +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 000000000..c3bbddd67 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,569 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "posix.h" +#include <sys/uio.h> + +#ifdef HAVE_LIBAIO +#include <libaio.h> + + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = pfd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags & (~O_DIRECT))); + pfd->odirect = 0; + } + + if (odirect && !pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags | O_DIRECT)); + pfd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d", + strerror (errno), pfd->fd, flags, pfd->odirect); + } +} + + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int fd; + int op; + off_t offset; +}; + + +int +posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + _fd = paiocb->fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)", + _fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) 
paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + 
paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%d,offset=%llu (%d/%s)", + _fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + + op_ret = res; + op_errno = 0; + + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); 
+ GF_FREE (paiocb); + } + + return 0; +} + + +int +posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat (this, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto err; + } + + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +void * +posix_aio_thread (void 
*data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + + +int +posix_aio_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. 
ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + + +int +posix_aio_on (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off (xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + + +#else + + +int +posix_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 000000000..5bde71601 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + + +int posix_aio_on (xlator_t *this); +int posix_aio_off (xlator_t *this); + +int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c new file mode 100644 index 000000000..219a582c9 --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.c @@ -0,0 +1,744 @@ +/* + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <libgen.h> +#ifdef GF_LINUX_HOST_OS +#include <alloca.h> +#endif + +#include "posix-handle.h" +#include "posix.h" +#include "xlator.h" +#include "syscall.h" + + +#define HANDLE_PFX ".glusterfs" +#define TRASH_DIR "landfill" + +#define UUID0_STR "00000000-0000-0000-0000-000000000000" +#define SLEN(str) (sizeof(str) - 1) + + +int +posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t buflen) +{ + char *uuid_str = NULL; + int len = 0; + + len = SLEN("../") + + SLEN("../") + + SLEN("00/") + + SLEN("00/") + + SLEN(UUID0_STR) + + 1 /* '\0' */ + ; + + if (basename) { + len += (strlen (basename) + 1); + } + + if (buflen < len || !buf) + return len; + + uuid_str = uuid_utoa (gfid); + + if (basename) { + len = snprintf (buf, buflen, "../../%02x/%02x/%s/%s", + gfid[0], gfid[1], uuid_str, basename); + } else { + len = snprintf (buf, buflen, "../../%02x/%02x/%s", + gfid[0], gfid[1], uuid_str); + } + + return len; +} + + +/* + TODO: explain how this pump fixes ELOOP +*/ +int +posix_handle_pump (xlator_t *this, char *buf, int len, int maxlen, + char *base_str, int base_len, int pfx_len) +{ + char linkname[512] = {0,}; /* "../../<gfid>/<NAME_MAX>" */ + int ret = 0; + int blen = 0; + int link_len = 0; + + /* is a directory's symlink-handle */ + ret = readlink (base_str, linkname, 512); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "internal readlink failed on %s (%s)", + base_str, strerror (errno)); + goto err; + } + + if (ret < 512) + linkname[ret] = 0; + + link_len = ret; + + if ((ret == 8) && memcmp (linkname, "../../..", 8) == 0) { + if (strcmp (base_str, buf) == 0) { + strcpy (buf + pfx_len, ".."); + } + goto out; + } + + if (ret < 50 || ret >= 512) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; 
+ } + + if (memcmp (linkname, "../../", 6) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + if ((linkname[2] != '/') || + (linkname[5] != '/') || + (linkname[8] != '/') || + (linkname[11] != '/') || + (linkname[48] != '/')) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + if ((linkname[20] != '-') || + (linkname[25] != '-') || + (linkname[30] != '-') || + (linkname[35] != '-')) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + blen = link_len - 48; + memmove (buf + base_len + blen, buf + base_len, + (strlen (buf) - base_len) + 1); + + strncpy (base_str + pfx_len, linkname + 6, 42); + + if (len + blen < maxlen) + strncpy (buf + pfx_len, linkname + 6, link_len - 6); +out: + return len + blen; +err: + return -1; +} + + +/* + posix_handle_path differs from posix_handle_gfid_path in the way that the + path filled in @buf by posix_handle_path will return type IA_IFDIR when + an lstat() is performed on it, whereas posix_handle_gfid_path returns path + to the handle symlink (typically used for the purpose of unlinking it). 
+ + posix_handle_path also guarantees immunity to ELOOP on the path returned by it +*/ + +int +posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, + char *ubuf, size_t size) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + int ret = -1; + struct stat stat; + char *base_str = NULL; + int base_len = 0; + int pfx_len; + int maxlen; + char *buf; + + priv = this->private; + + uuid_str = uuid_utoa (gfid); + + if (ubuf) { + buf = ubuf; + maxlen = size; + } else { + maxlen = PATH_MAX; + buf = alloca (maxlen); + } + + base_len = (priv->base_path_length + SLEN(HANDLE_PFX) + 45); + base_str = alloca (base_len + 1); + base_len = snprintf (base_str, base_len + 1, "%s/%s/%02x/%02x/%s", + priv->base_path, HANDLE_PFX, gfid[0], gfid[1], + uuid_str); + + pfx_len = priv->base_path_length + 1 + SLEN(HANDLE_PFX) + 1; + + if (basename) { + len = snprintf (buf, maxlen, "%s/%s", base_str, basename); + } else { + len = snprintf (buf, maxlen, "%s", base_str); + } + + ret = lstat (base_str, &stat); + + if (!(ret == 0 && S_ISLNK(stat.st_mode) && stat.st_nlink == 1)) + goto out; + + do { + errno = 0; + ret = posix_handle_pump (this, buf, len, maxlen, + base_str, base_len, pfx_len); + if (ret == -1) + break; + + len = ret; + + ret = lstat (buf, &stat); + } while ((ret == -1) && errno == ELOOP); + +out: + return len + 1; +} + + +int +posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t buflen) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + + priv = this->private; + + len = priv->base_path_length /* option directory "/export" */ + + SLEN("/") + + SLEN(HANDLE_PFX) + + SLEN("/") + + SLEN("00/") + + SLEN("00/") + + SLEN(UUID0_STR) + + 1 /* '\0' */ + ; + + if (basename) { + len += (strlen (basename) + 1); + } else { + len += 256; /* worst-case for directory's symlink-handle expansion */ + } + + if ((buflen < len) || !buf) + return len; + + uuid_str = uuid_utoa (gfid); + + if 
(__is_root_gfid (gfid)) { + if (basename) { + len = snprintf (buf, buflen, "%s/%s", priv->base_path, + basename); + } else { + strncpy (buf, priv->base_path, buflen); + } + goto out; + } + + if (basename) { + len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s/%s", priv->base_path, + HANDLE_PFX, gfid[0], gfid[1], uuid_str, basename); + } else { + len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s", priv->base_path, + HANDLE_PFX, gfid[0], gfid[1], uuid_str); + } +out: + return len; +} + + +int +posix_handle_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + char *handle_pfx = NULL; + int ret = 0; + int len = 0; + struct stat stbuf; + struct stat rootbuf; + struct stat exportbuf; + char *rootstr = NULL; + uuid_t gfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + + priv = this->private; + + ret = stat (priv->base_path, &exportbuf); + if (ret || !S_ISDIR (exportbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", priv->base_path); + return -1; + } + + handle_pfx = alloca (priv->base_path_length + 1 + strlen (HANDLE_PFX) + + 1); + + sprintf (handle_pfx, "%s/%s", priv->base_path, HANDLE_PFX); + + ret = stat (handle_pfx, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (handle_pfx, 0600); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + handle_pfx, strerror (errno)); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "Checking for %s failed: %s", + handle_pfx, strerror (errno)); + return -1; + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", + handle_pfx); + return -1; + } + break; + default: + break; + } + + stat (handle_pfx, &priv->handledir); + + len = posix_handle_path (this, gfid, NULL, NULL, 0); + rootstr = alloca (len); + posix_handle_path (this, gfid, NULL, rootstr, len); + + ret = stat (rootstr, &rootbuf); + switch (ret) { + case -1: + if (errno != ENOENT) { + gf_log (this->name, 
GF_LOG_ERROR, + "%s: %s", priv->base_path, + strerror (errno)); + return -1; + } + + ret = posix_handle_mkdir_hashes (this, rootstr); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + rootstr, strerror (errno)); + return -1; + } + + ret = symlink ("../../..", rootstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "symlink %s creation failed (%s)", + rootstr, strerror (errno)); + return -1; + } + break; + case 0: + if ((exportbuf.st_ino == rootbuf.st_ino) && + (exportbuf.st_dev == rootbuf.st_dev)) + return 0; + + gf_log (this->name, GF_LOG_ERROR, + "Different dirs %s (%lld/%lld) != %s (%lld/%lld)", + priv->base_path, (long long) exportbuf.st_ino, + (long long) exportbuf.st_dev, rootstr, + (long long) rootbuf.st_ino, (long long) rootbuf.st_dev); + return -1; + + break; + } + + return 0; +} + +gf_boolean_t +posix_does_old_trash_exists (char *old_trash) +{ + uuid_t gfid = {0}; + gf_boolean_t exists = _gf_false; + struct stat stbuf = {0}; + int ret = 0; + + ret = lstat (old_trash, &stbuf); + if ((ret == 0) && S_ISDIR (stbuf.st_mode)) { + ret = sys_lgetxattr (old_trash, "trusted.gfid", gfid, 16); + if ((ret < 0) && (errno == ENODATA)) + exists = _gf_true; + } + return exists; +} + +int +posix_handle_new_trash_init (xlator_t *this, char *trash) +{ + int ret = 0; + struct stat stbuf = {0}; + + ret = lstat (trash, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (trash, 0755); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + trash, strerror (errno)); + } + } else { + gf_log (this->name, GF_LOG_ERROR, "Checking for %s " + "failed: %s", trash, strerror (errno)); + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", trash); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +posix_mv_old_trash_into_new_trash (xlator_t *this, char *old, char *new) +{ + char dest_old[PATH_MAX] = {0}; + int 
ret = 0; + uuid_t dest_name = {0}; + + if (!posix_does_old_trash_exists (old)) + goto out; + uuid_generate (dest_name); + snprintf (dest_old, sizeof (dest_old), "%s/%s", new, + uuid_utoa (dest_name)); + ret = rename (old, dest_old); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Not able to move " + "%s -> %s (%s)", old, dest_old, strerror (errno)); + } +out: + return ret; +} + +int +posix_handle_trash_init (xlator_t *this) +{ + int ret = -1; + struct posix_private *priv = NULL; + char old_trash[PATH_MAX] = {0}; + + priv = this->private; + + priv->trash_path = GF_CALLOC (1, priv->base_path_length + strlen ("/") + + strlen (HANDLE_PFX) + strlen ("/") + + strlen (TRASH_DIR) + 1, + gf_posix_mt_trash_path); + + if (!priv->trash_path) + goto out; + + strncpy (priv->trash_path, priv->base_path, priv->base_path_length); + strcat (priv->trash_path, "/" HANDLE_PFX "/" TRASH_DIR); + ret = posix_handle_new_trash_init (this, priv->trash_path); + if (ret) + goto out; + snprintf (old_trash, sizeof (old_trash), "%s/.landfill", + priv->base_path); + ret = posix_mv_old_trash_into_new_trash (this, old_trash, + priv->trash_path); +out: + return ret; +} + +int +posix_handle_mkdir_hashes (xlator_t *this, const char *newpath) +{ + char *duppath = NULL; + char *parpath = NULL; + int ret = 0; + + duppath = strdupa (newpath); + parpath = dirname (duppath); + parpath = dirname (duppath); + + ret = mkdir (parpath, 0700); + if (ret == -1 && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, + "error mkdir hash-1 %s (%s)", + parpath, strerror (errno)); + return -1; + } + + strcpy (duppath, newpath); + parpath = dirname (duppath); + + ret = mkdir (parpath, 0700); + if (ret == -1 && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, + "error mkdir hash-2 %s (%s)", + parpath, strerror (errno)); + return -1; + } + + return 0; +} + + +int +posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat *oldbuf) +{ + char *newpath = NULL; + struct stat newbuf; + int 
ret = -1; + + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + + ret = lstat (newpath, &newbuf); + if (ret == -1 && errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", newpath, strerror (errno)); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + ret = posix_handle_mkdir_hashes (this, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); +#else + ret = link (oldpath, newpath); +#endif + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "link %s -> %s failed (%s)", + oldpath, newpath, strerror (errno)); + return -1; + } + + ret = lstat (newpath, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + } + + if (newbuf.st_ino != oldbuf->st_ino || + newbuf.st_dev != oldbuf->st_dev) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev, + newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev); + ret = -1; + } + + return ret; +} + + +int +posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *oldbuf) +{ + char *oldpath = NULL; + char *newpath = NULL; + struct stat newbuf; + int ret = -1; + + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + MAKE_HANDLE_RELPATH (oldpath, this, loc->pargfid, loc->name); + + + ret = lstat (newpath, &newbuf); + if (ret == -1 && errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", newpath, strerror (errno)); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + ret = posix_handle_mkdir_hashes 
(this, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + + ret = symlink (oldpath, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "symlink %s -> %s failed (%s)", + oldpath, newpath, strerror (errno)); + return -1; + } + + ret = lstat (newpath, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "stat on %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + } + + ret = stat (real_path, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "stat on %s failed (%s)", newpath, strerror (errno)); + return -1; + } + + if (!oldbuf) + return ret; + + if (newbuf.st_ino != oldbuf->st_ino || + newbuf.st_dev != oldbuf->st_dev) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev, + newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev); + ret = -1; + } + + return ret; +} + + +static int +posix_handle_unset_gfid (xlator_t *this, uuid_t gfid) +{ + char *path = NULL; + int ret = 0; + struct stat stat; + + MAKE_HANDLE_GFID_PATH (path, this, gfid, NULL); + + ret = lstat (path, &stat); + + if (ret == -1) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", path, strerror (errno)); + } + goto out; + } + + ret = unlink (path); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlink %s failed (%s)", path, strerror (errno)); + } + +out: + return ret; +} + + +int +posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename) +{ + int ret; + struct iatt stat; + char *path = NULL; + + + if (!basename) { + ret = posix_handle_unset_gfid (this, gfid); + return ret; + } + + MAKE_HANDLE_PATH (path, this, gfid, basename); + + ret = posix_istat (this, gfid, basename, &stat); + + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", path, strerror (errno)); + return 
-1; + } + + ret = posix_handle_unset_gfid (this, stat.ia_gfid); + + return ret; +} + + +int +posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, + char *real_path) +{ + int ret = -1; + struct stat stbuf = {0,}; + char *newpath = NULL; + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + ret = lstat (newpath, &stbuf); + if (!ret) { +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, newpath, AT_FDCWD, real_path, 0); +#else + ret = link (newpath, real_path); +#endif + } + + return ret; +} diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h new file mode 100644 index 000000000..f1163b727 --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.h @@ -0,0 +1,143 @@ +/* + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _POSIX_HANDLE_H +#define _POSIX_HANDLE_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include "xlator.h" + + +#define LOC_HAS_ABSPATH(loc) ((loc) && (loc->path) && (loc->path[0] == '/')) + +#define MAKE_REAL_PATH(var, this, path) do { \ + var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ + strcpy (var, POSIX_BASE_PATH(this)); \ + strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ + } while (0) + + +#define MAKE_HANDLE_PATH(var, this, gfid, base) do { \ + int __len; \ + __len = posix_handle_path (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_path (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_HANDLE_GFID_PATH(var, this, gfid, base) do { \ + int __len = 0; \ + __len = posix_handle_gfid_path (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_gfid_path (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_HANDLE_RELPATH(var, this, gfid, base) do { \ + int __len; \ + __len = posix_handle_relpath (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_relpath (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) do { \ + if (uuid_is_null (loc->gfid)) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "null gfid for path %s", loc->path); \ + break; \ + } \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (rpath, this, loc->path); \ + op_ret = posix_pstat (this, loc->gfid, rpath, iatt_p); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat (this, loc->gfid, NULL, iatt_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH (rpath, this, loc->gfid, NULL); \ + break; \ + } \ + /* __ret == -1 && errno == ELOOP */ \ + } while (0) + + +#define MAKE_ENTRY_HANDLE(entp, parp, this, loc, ent_p) do { \ + char *__parp; \ + \ + if (uuid_is_null 
(loc->pargfid) || !loc->name) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "null pargfid/name for path %s", loc->path); \ + break; \ + } \ + \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (entp, this, loc->path); \ + __parp = strdupa (entp); \ + parp = dirname (__parp); \ + op_ret = posix_pstat (this, NULL, entp, ent_p); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat (this, loc->pargfid, loc->name, ent_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH (parp, this, loc->pargfid, NULL); \ + MAKE_HANDLE_PATH (entp, this, loc->pargfid, loc->name); \ + break; \ + } \ + /* __ret == -1 && errno == ELOOP */ \ + /* expand ELOOP */ \ + } while (0) + + + +int +posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, char *buf, + size_t len); +int +posix_handle_path_safe (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t len); + +int +posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t len); + +int +posix_handle_hard (xlator_t *this, const char *path, uuid_t gfid, + struct stat *buf); + + +int +posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *buf); + +int +posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename); + +int posix_handle_mkdir_hashes (xlator_t *this, const char *newpath); + +int posix_handle_init (xlator_t *this); + +int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, + char *real_path); + +int +posix_handle_trash_init (xlator_t *this); +#endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c new file mode 100644 index 000000000..e295f8850 --- /dev/null +++ b/xlators/storage/posix/src/posix-helpers.c @@ -0,0 +1,1391 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <libgen.h> +#include <pthread.h> +#include <ftw.h> +#include <sys/stat.h> +#include <signal.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#include "glusterfs.h" +#include "checksum.h" +#include "dict.h" +#include "logging.h" +#include "posix.h" +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "syscall.h" +#include "statedump.h" +#include "locking.h" +#include "timer.h" +#include "glusterfs3-xdr.h" +#include "hashfn.h" +#include "glusterfs-acl.h" +#include <fnmatch.h> + +char *marker_xattrs[] = {"trusted.glusterfs.quota.*", + "trusted.glusterfs.*.xtime", + NULL}; + +static char* posix_ignore_xattrs[] = { + "gfid-req", + GLUSTERFS_ENTRYLK_COUNT, + GLUSTERFS_INODELK_COUNT, + GLUSTERFS_POSIXLK_COUNT, + NULL +}; + +gf_boolean_t +posix_special_xattr (char **pattern, char *key) +{ + int i = 0; + gf_boolean_t flag = _gf_false; + + GF_VALIDATE_OR_GOTO ("posix", pattern, out); + GF_VALIDATE_OR_GOTO ("posix", key, out); + + for (i = 0; pattern[i]; i++) { + if (!fnmatch (pattern[i], key, 0)) { + flag = _gf_true; + break; + } + } +out: + return flag; +} + +static gf_boolean_t +posix_xattr_ignorable (char *key, posix_xattr_filler_t *filler) +{ + int i = 0; + gf_boolean_t ignore = _gf_false; + + GF_ASSERT (key); + if (!key) + goto out; + for (i = 0; posix_ignore_xattrs[i]; i++) { + if (!strcmp (key, posix_ignore_xattrs[i])) { + ignore = _gf_true; + goto out; + } + } + if ((!strcmp 
(key, GF_CONTENT_KEY)) + && (!IA_ISREG (filler->stbuf->ia_type))) + ignore = _gf_true; +out: + return ignore; +} + +static int +_posix_xattr_get_set (dict_t *xattr_req, + char *key, + data_t *data, + void *xattrargs) +{ + posix_xattr_filler_t *filler = xattrargs; + char *value = NULL; + ssize_t xattr_size = -1; + int ret = -1; + char *databuf = NULL; + int _fd = -1; + loc_t *loc = NULL; + ssize_t req_size = 0; + + + if (posix_xattr_ignorable (key, filler)) + goto out; + /* should size be put into the data_t ? */ + if (!strcmp (key, GF_CONTENT_KEY) + && IA_ISREG (filler->stbuf->ia_type)) { + + /* file content request */ + req_size = data_to_uint64 (data); + if (req_size >= filler->stbuf->ia_size) { + _fd = open (filler->real_path, O_RDONLY); + if (_fd == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "Opening file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + databuf = GF_CALLOC (1, filler->stbuf->ia_size, + gf_posix_mt_char); + if (!databuf) { + goto err; + } + + ret = read (_fd, databuf, filler->stbuf->ia_size); + if (ret == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "Read on file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + ret = close (_fd); + _fd = -1; + if (ret == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "Close on file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + ret = dict_set_bin (filler->xattr, key, + databuf, filler->stbuf->ia_size); + if (ret < 0) { + gf_log (filler->this->name, GF_LOG_ERROR, + "failed to set dict value. 
key: %s, path: %s", + key, filler->real_path); + goto err; + } + + /* To avoid double free in cleanup below */ + databuf = NULL; + err: + if (_fd != -1) + close (_fd); + GF_FREE (databuf); + } + } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { + loc = filler->loc; + if (loc) { + ret = dict_set_uint32 (filler->xattr, key, + loc->inode->fd_count); + if (ret < 0) + gf_log (filler->this->name, GF_LOG_WARNING, + "Failed to set dictionary value for %s", + key); + } + } else { + xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); + + if (xattr_size > 0) { + value = GF_CALLOC (1, xattr_size + 1, + gf_posix_mt_char); + if (!value) + return -1; + + xattr_size = sys_lgetxattr (filler->real_path, key, value, + xattr_size); + if (xattr_size <= 0) { + gf_log (filler->this->name, GF_LOG_WARNING, + "getxattr failed. path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + return -1; + } + + value[xattr_size] = '\0'; + ret = dict_set_bin (filler->xattr, key, + value, xattr_size); + if (ret < 0) { + gf_log (filler->this->name, GF_LOG_DEBUG, + "dict set failed. 
path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + } + } + } +out: + return 0; +} + + +int +posix_fill_gfid_path (xlator_t *this, const char *path, struct iatt *iatt) +{ + int ret = 0; + ssize_t size = 0; + + if (!iatt) + return 0; + + size = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); + /* Return value of getxattr */ + if ((size == 16) || (size == -1)) + ret = 0; + else + ret = size; + + return ret; +} + + +int +posix_fill_gfid_fd (xlator_t *this, int fd, struct iatt *iatt) +{ + int ret = 0; + ssize_t size = 0; + + if (!iatt) + return 0; + + size = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); + /* Return value of getxattr */ + if ((size == 16) || (size == -1)) + ret = 0; + else + ret = size; + + return ret; +} + +void +posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf) +{ + uint64_t temp_ino = 0; + int j = 0; + int i = 0; + + /* consider least significant 8 bytes of value out of gfid */ + if (uuid_is_null (buf->ia_gfid)) { + buf->ia_ino = -1; + goto out; + } + for (i = 15; i > (15 - 8); i--) { + temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; + j += 8; + } + buf->ia_ino = temp_ino; +out: + return; +} + +int +posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p) +{ + int ret = 0; + struct stat fstatbuf = {0, }; + struct iatt stbuf = {0, }; + + ret = fstat (fd, &fstatbuf); + if (ret == -1) + goto out; + + if (fstatbuf.st_nlink && !S_ISDIR (fstatbuf.st_mode)) + fstatbuf.st_nlink--; + + iatt_from_stat (&stbuf, &fstatbuf); + + ret = posix_fill_gfid_fd (this, fd, &stbuf); + if (ret) + gf_log_callingfn (this->name, GF_LOG_DEBUG, "failed to get gfid"); + + posix_fill_ino_from_gfid (this, &stbuf); + + if (stbuf_p) + *stbuf_p = stbuf; + +out: + return ret; +} + + +int +posix_istat (xlator_t *this, uuid_t gfid, const char *basename, + struct iatt *buf_p) +{ + char *real_path = NULL; + struct stat lstatbuf = {0, }; + struct iatt stbuf = {0, }; + int ret = 0; + struct posix_private *priv = NULL; + + + priv = this->private; + 
+ MAKE_HANDLE_PATH (real_path, this, gfid, basename); + + ret = lstat (real_path, &lstatbuf); + + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT && errno != ELOOP) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + real_path, strerror (errno)); + } else { + // may be some backend filesystem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + real_path, ret); + ret = -1; + } + goto out; + } + + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } + + if (!S_ISDIR (lstatbuf.st_mode)) + lstatbuf.st_nlink --; + + iatt_from_stat (&stbuf, &lstatbuf); + + if (basename) + posix_fill_gfid_path (this, real_path, &stbuf); + else + uuid_copy (stbuf.ia_gfid, gfid); + + posix_fill_ino_from_gfid (this, &stbuf); + + if (buf_p) + *buf_p = stbuf; +out: + return ret; +} + + + +int +posix_pstat (xlator_t *this, uuid_t gfid, const char *path, + struct iatt *buf_p) +{ + struct stat lstatbuf = {0, }; + struct iatt stbuf = {0, }; + int ret = 0; + struct posix_private *priv = NULL; + + + priv = this->private; + + ret = lstat (path, &lstatbuf); + + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + path, strerror (errno)); + } else { + // may be some backend filesytem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. 
" + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + path, ret); + ret = -1; + } + goto out; + } + + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } + + if (!S_ISDIR (lstatbuf.st_mode)) + lstatbuf.st_nlink --; + + iatt_from_stat (&stbuf, &lstatbuf); + + if (gfid && !uuid_is_null (gfid)) + uuid_copy (stbuf.ia_gfid, gfid); + else + posix_fill_gfid_path (this, path, &stbuf); + + posix_fill_ino_from_gfid (this, &stbuf); + + if (buf_p) + *buf_p = stbuf; +out: + return ret; +} + + +dict_t * +posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, + dict_t *xattr_req, struct iatt *buf) +{ + dict_t *xattr = NULL; + posix_xattr_filler_t filler = {0, }; + + xattr = get_new_dict(); + if (!xattr) { + goto out; + } + + filler.this = this; + filler.real_path = real_path; + filler.xattr = xattr; + filler.stbuf = buf; + filler.loc = loc; + + dict_foreach (xattr_req, _posix_xattr_get_set, &filler); +out: + return xattr; +} + + +int +posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + void *uuid_req = NULL; + uuid_t uuid_curr; + int ret = 0; + ssize_t size = 0; + struct stat stat = {0, }; + + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { + ret = 0; + goto verify_handle; + } + + ret = dict_get_ptr (xattr_req, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get the gfid from dict for %s", + loc->path); + goto out; + } + + ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "setting GFID on %s failed (%s)", path, + strerror (errno)); + goto out; + } + uuid_copy (uuid_curr, uuid_req); + +verify_handle: + if (!S_ISDIR (stat.st_mode)) + ret = posix_handle_hard (this, path, 
uuid_curr, &stat); + else + ret = posix_handle_soft (this, path, loc, uuid_curr, &stat); + +out: + return ret; +} + + +int +posix_set_file_contents (xlator_t *this, const char *path, char *keyp, + data_t *value, int flags) +{ + char * key = NULL; + char real_path[PATH_MAX]; + int32_t file_fd = -1; + int op_ret = 0; + int ret = -1; + + + /* XXX: does not handle assigning GFID to created files */ + return -1; + + key = &(keyp[15]); + sprintf (real_path, "%s/%s", path, key); + + if (flags & XATTR_REPLACE) { + /* if file exists, replace it + * else, error out */ + file_fd = open (real_path, O_TRUNC|O_WRONLY); + + if (file_fd == -1) { + goto create; + } + + if (value->len) { + ret = write (file_fd, value->data, value->len); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "write failed while doing setxattr " + "for key %s on path %s: %s", + key, real_path, strerror (errno)); + goto out; + } + + ret = close (file_fd); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "close failed on %s: %s", + real_path, strerror (errno)); + goto out; + } + } + + create: /* we know file doesn't exist, create it */ + + file_fd = open (real_path, O_CREAT|O_WRONLY, 0644); + + if (file_fd == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "failed to open file %s with O_CREAT: %s", + key, strerror (errno)); + goto out; + } + + ret = write (file_fd, value->data, value->len); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "write failed on %s while setxattr with " + "key %s: %s", + real_path, key, strerror (errno)); + goto out; + } + + ret = close (file_fd); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "close failed on %s while setxattr with " + "key %s: %s", + real_path, key, strerror (errno)); + goto out; + } + } + +out: + return op_ret; +} + + +int +posix_get_file_contents (xlator_t *this, uuid_t pargfid, + const char *name, char **contents) +{ + char *real_path = 
NULL; + int32_t file_fd = -1; + struct iatt stbuf = {0,}; + int op_ret = 0; + int ret = -1; + + + MAKE_HANDLE_PATH (real_path, this, pargfid, name); + + op_ret = posix_istat (this, pargfid, name, &stbuf); + if (op_ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", + real_path, strerror (errno)); + goto out; + } + + file_fd = open (real_path, O_RDONLY); + + if (file_fd == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", + real_path, strerror (errno)); + goto out; + } + + *contents = GF_CALLOC (stbuf.ia_size + 1, sizeof(char), + gf_posix_mt_char); + if (! *contents) { + op_ret = -errno; + goto out; + } + + ret = read (file_fd, *contents, stbuf.ia_size); + if (ret <= 0) { + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s", + real_path, strerror (errno)); + goto out; + } + + *contents[stbuf.ia_size] = '\0'; + + op_ret = close (file_fd); + file_fd = -1; + if (op_ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", + real_path, strerror (errno)); + goto out; + } + +out: + if (op_ret < 0) { + GF_FREE (*contents); + if (file_fd != -1) + close (file_fd); + } + + return op_ret; +} + +static int gf_xattr_enotsup_log; + +int +posix_handle_pair (xlator_t *this, const char *real_path, + char *key, data_t *value, int flags) +{ + int sys_ret = -1; + int ret = 0; + + if (ZR_FILE_CONTENT_REQUEST(key)) { + ret = posix_set_file_contents (this, real_path, key, value, + flags); + } else { + sys_ret = sys_lsetxattr (real_path, key, value->data, + value->len, flags); + + if (sys_ret < 0) { + if (errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, + this->name,GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); + } else if (errno == ENOENT) { + if (!posix_special_xattr (marker_xattrs, + key)) { + gf_log (this->name, GF_LOG_ERROR, + "setxattr on %s failed: %s", + 
real_path, strerror (errno)); + } + } else { + +#ifdef GF_DARWIN_HOST_OS + gf_log (this->name, + ((errno == EINVAL) ? + GF_LOG_DEBUG : GF_LOG_ERROR), + "%s: key:%s error:%s", + real_path, key, + strerror (errno)); +#else /* ! DARWIN */ + gf_log (this->name, GF_LOG_ERROR, + "%s: key:%s error:%s", + real_path, key, + strerror (errno)); +#endif /* DARWIN */ + } + + ret = -errno; + goto out; + } + } +out: + return ret; +} + +int +posix_fhandle_pair (xlator_t *this, int fd, + char *key, data_t *value, int flags) +{ + int sys_ret = -1; + int ret = 0; + + sys_ret = sys_fsetxattr (fd, key, value->data, + value->len, flags); + + if (sys_ret < 0) { + if (errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, + this->name,GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); + } else if (errno == ENOENT) { + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr on fd=%d failed: %s", fd, + strerror (errno)); + } else { + +#ifdef GF_DARWIN_HOST_OS + gf_log (this->name, + ((errno == EINVAL) ? + GF_LOG_DEBUG : GF_LOG_ERROR), + "fd=%d: key:%s error:%s", + fd, key, strerror (errno)); +#else /* ! 
DARWIN */ + gf_log (this->name, GF_LOG_ERROR, + "fd=%d: key:%s error:%s", + fd, key, strerror (errno)); +#endif /* DARWIN */ + } + + ret = -errno; + goto out; + } + +out: + return ret; +} + + +static int +janitor_walker (const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + struct iatt stbuf = {0, }; + xlator_t *this = NULL; + + this = THIS; + posix_pstat (this, NULL, fpath, &stbuf); + switch (sb->st_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + case S_IFLNK: + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + gf_log (THIS->name, GF_LOG_TRACE, + "unlinking %s", fpath); + unlink (fpath); + if (stbuf.ia_nlink == 1) + posix_handle_unset (this, stbuf.ia_gfid, NULL); + break; + + case S_IFDIR: + if (ftwbuf->level) { /* don't remove top level dir */ + gf_log (THIS->name, GF_LOG_TRACE, + "removing directory %s", fpath); + + rmdir (fpath); + posix_handle_unset (this, stbuf.ia_gfid, NULL); + } + break; + } + + return 0; /* 0 = FTW_CONTINUE */ +} + + +static struct posix_fd * +janitor_get_next_fd (xlator_t *this) +{ + struct posix_private *priv = NULL; + struct posix_fd *pfd = NULL; + + struct timespec timeout; + + priv = this->private; + + pthread_mutex_lock (&priv->janitor_lock); + { + if (list_empty (&priv->janitor_fds)) { + time (&timeout.tv_sec); + timeout.tv_sec += priv->janitor_sleep_duration; + timeout.tv_nsec = 0; + + pthread_cond_timedwait (&priv->janitor_cond, + &priv->janitor_lock, + &timeout); + goto unlock; + } + + pfd = list_entry (priv->janitor_fds.next, struct posix_fd, + list); + + list_del (priv->janitor_fds.next); + } +unlock: + pthread_mutex_unlock (&priv->janitor_lock); + + return pfd; +} + + +static void * +posix_janitor_thread_proc (void *data) +{ + xlator_t * this = NULL; + struct posix_private *priv = NULL; + struct posix_fd *pfd; + + time_t now; + + this = data; + priv = this->private; + + THIS = this; + + while (1) { + time (&now); + if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { + gf_log 
/*
 * is_fresh_file - report whether the inode behind @stat changed just now.
 *
 * Returns 1 when st_ctime equals the current wall-clock second or the
 * second before it, 0 for anything older or lying in the future.
 */
static int
is_fresh_file (struct stat *stat)
{
        struct timeval now;

        gettimeofday (&now, NULL);

        if (stat->st_ctime < (now.tv_sec - 1))
                return 0;       /* changed more than a second ago */

        if (stat->st_ctime > now.tv_sec)
                return 0;       /* timestamp is ahead of the clock */

        return 1;
}
It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + +int +posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) +{ + int ret = 0; + data_t *data = NULL; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); + if (data) { + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, + data->data, data->len, 0); + if (ret != 0) + goto out; + } + + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); + if (data) { + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, + data->data, data->len, 0); + if (ret != 0) + goto out; + } + +out: + return ret; +} + +static int +_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp (GFID_XATTR_KEY, k) || + !strcmp ("gfid-req", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || + ZR_FILE_CONTENT_REQUEST(k)) { + return 0; + } + + ret = posix_handle_pair (filler->this, filler->real_path, k, v, + XATTR_CREATE); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + +int +posix_entry_create_xattr_set (xlator_t *this, const char *path, + dict_t *dict) +{ + int 
ret = -1; + + posix_xattr_filler_t filler = {0,}; + + if (!dict) + goto out; + + filler.this = this; + filler.real_path = path; + + ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler); + +out: + return ret; +} + + +static int +__posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p) +{ + uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; + int ret = -1; + char *real_path = NULL; + int _fd = -1; + DIR *dir = NULL; + + ret = __fd_ctx_get (fd, this, &tmp_pfd); + if (ret == 0) { + pfd = (void *)(long) tmp_pfd; + ret = 0; + goto out; + } + + if (!fd_is_anonymous(fd)) + /* anonymous fd */ + goto out; + + MAKE_HANDLE_PATH (real_path, this, fd->inode->gfid, NULL); + + pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + goto out; + } + pfd->fd = -1; + + if (fd->inode->ia_type == IA_IFDIR) { + dir = opendir (real_path); + if (!dir) { + GF_FREE (pfd); + pfd = NULL; + goto out; + } + _fd = dirfd (dir); + } + + if (fd->inode->ia_type == IA_IFREG) { + _fd = open (real_path, O_RDWR|O_LARGEFILE); + if (_fd == -1) { + GF_FREE (pfd); + pfd = NULL; + goto out; + } + } + + pfd->fd = _fd; + pfd->dir = dir; + + ret = __fd_ctx_set (fd, this, (uint64_t) (long) pfd); + if (ret != 0) { + if (_fd != -1) + close (_fd); + if (dir) + closedir (dir); + GF_FREE (pfd); + pfd = NULL; + goto out; + } + + ret = 0; +out: + if (pfd_p) + *pfd_p = pfd; + return ret; +} + + +int +posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) +{ + int ret; + + LOCK (&fd->inode->lock); + { + ret = __posix_fd_ctx_get (fd, this, pfd); + } + UNLOCK (&fd->inode->lock); + + return ret; +} + +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + 
gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. */ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! 
-> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = 
fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. + */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } +} diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h new 
file mode 100644 index 000000000..81752c17e --- /dev/null +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __POSIX_MEM_TYPES_H__ +#define __POSIX_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_posix_mem_types_ { + gf_posix_mt_dir_entry_t = gf_common_mt_end + 1, + gf_posix_mt_posix_fd, + gf_posix_mt_char, + gf_posix_mt_posix_private, + gf_posix_mt_int32_t, + gf_posix_mt_posix_dev_t, + gf_posix_mt_trash_path, + gf_posix_mt_paiocb, + gf_posix_mt_end +}; +#endif + diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index b730b136f..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -24,17 +14,28 @@ #define __XOPEN_SOURCE 500 +#include <openssl/md5.h> #include <stdint.h> #include <sys/time.h> #include <sys/resource.h> #include <errno.h> +#include <libgen.h> +#include <pthread.h> #include <ftw.h> +#include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> #endif /* GF_BSD_HOST_OS */ +#ifdef HAVE_LINKAT +#include <fcntl.h> +#endif /* HAVE_LINKAT */ + #include "glusterfs.h" +#include "checksum.h" #include "dict.h" #include "logging.h" #include "posix.h" @@ -45,18 +46,28 @@ #include "compat.h" #include "byte-order.h" #include "syscall.h" +#include "statedump.h" +#include "locking.h" +#include "timer.h" +#include "glusterfs3-xdr.h" +#include "hashfn.h" +#include "posix-aio.h" +#include "glusterfs-acl.h" + +extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 #undef HAVE_SET_FSID #ifdef HAVE_SET_FSID #define DECLARE_OLD_FS_ID_VAR uid_t old_fsuid; gid_t old_fsgid; -#define SET_FS_ID(uid, gid) do { \ +#define SET_FS_ID(uid, gid) do { \ old_fsuid = setfsuid (uid); \ old_fsgid = setfsgid (gid); \ } while (0) -#define SET_TO_OLD_FS_ID() do { \ +#define SET_TO_OLD_FS_ID() do { \ setfsuid (old_fsuid); \ setfsgid (old_fsgid); \ } while (0) @@ -69,243 +80,106 @@ #endif -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct stat *stbuf; - loc_t *loc; -} posix_xattr_filler_t; - int posix_forget (xlator_t *this, inode_t *inode) { - uint64_t tmp_cache = 0; - if (!inode_ctx_del (inode, this, &tmp_cache)) - dict_destroy ((dict_t *)(long)tmp_cache); - - return 0; -} - -static void -_posix_xattr_get_set (dict_t *xattr_req, - char *key, - data_t *data, - void 
*xattrargs) -{ - posix_xattr_filler_t *filler = xattrargs; - char *value = NULL; - ssize_t xattr_size = -1; - int ret = -1; - char *databuf = NULL; - int _fd = -1; - loc_t *loc = NULL; - ssize_t req_size = 0; - - - /* should size be put into the data_t ? */ - if (!strcmp (key, "glusterfs.content")) { - /* file content request */ - req_size = data_to_uint64 (data); - if (req_size >= filler->stbuf->st_size) { - _fd = open (filler->real_path, O_RDONLY); - - if (_fd == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Opening file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - databuf = calloc (1, filler->stbuf->st_size); - - if (!databuf) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Out of memory."); - goto err; - } - - ret = read (_fd, databuf, filler->stbuf->st_size); - if (ret == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Read on file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - ret = close (_fd); - _fd = -1; - if (ret == -1) { - gf_log (filler->this->name, GF_LOG_ERROR, - "Close on file %s failed: %s", - filler->real_path, strerror (errno)); - goto err; - } - - ret = dict_set_bin (filler->xattr, key, - databuf, filler->stbuf->st_size); - if (ret < 0) { - goto err; - } - - /* To avoid double free in cleanup below */ - databuf = NULL; - err: - if (_fd != -1) - close (_fd); - if (databuf) - FREE (databuf); - } - } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { - loc = filler->loc; - if (!list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); - } - } else { - xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); - - if (xattr_size > 0) { - value = calloc (1, xattr_size + 1); - - sys_lgetxattr (filler->real_path, key, value, - xattr_size); - - value[xattr_size] = '\0'; - ret = dict_set_bin (filler->xattr, key, - value, xattr_size); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_DEBUG, - 
"dict set failed. path: %s, key: %s", - filler->real_path, key); - } - } -} - - -dict_t * -posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, - dict_t *xattr_req, struct stat *buf) -{ - dict_t *xattr = NULL; - posix_xattr_filler_t filler = {0, }; - - xattr = get_new_dict(); - if (!xattr) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - filler.this = this; - filler.real_path = real_path; - filler.xattr = xattr; - filler.stbuf = buf; - filler.loc = loc; - - dict_foreach (xattr_req, _posix_xattr_get_set, &filler); -out: - return xattr; -} - - -static int -posix_scale_st_ino (struct posix_private *priv, struct stat *buf) -{ - int i = 0; - int ret = -1; - ino_t temp_ino = 0; - - for (i = 0; i < priv->num_devices_to_span; i++) { - if (buf->st_dev == priv->st_device[i]) - break; - if (priv->st_device[i] == 0) { - priv->st_device[i] = buf->st_dev; - break; - } - } - - if (i == priv->num_devices_to_span) - goto out; - - temp_ino = (buf->st_ino * priv->num_devices_to_span) + i; - - buf->st_ino = temp_ino; + uint64_t tmp_cache = 0; + if (!inode_ctx_del (inode, this, &tmp_cache)) + dict_destroy ((dict_t *)(long)tmp_cache); - ret = 0; - out: - return ret; + return 0; } +/* Regular fops */ int32_t posix_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) + loc_t *loc, dict_t *xdata) { - struct stat buf = {0, }; - char * real_path = NULL; + struct iatt buf = {0, }; int32_t op_ret = -1; + int32_t entry_ret = 0; int32_t op_errno = 0; dict_t * xattr = NULL; - - struct posix_private *priv = NULL; + char * real_path = NULL; + char * par_path = NULL; + struct iatt postparent = {0,}; + int32_t gfidless = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - MAKE_REAL_PATH (real_path, this, loc->path); + /* The Hidden directory should be for housekeeping purpose and it + should not get any gfid on it */ + if (__is_root_gfid 
(loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "Lookup issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } - priv = this->private; + op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless); + op_ret = -1; + if (uuid_is_null (loc->pargfid)) { + /* nameless lookup */ + MAKE_INODE_HANDLE (real_path, this, loc, &buf); + } else { + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); + + if (uuid_is_null (loc->inode->gfid)) { + posix_gfid_heal (this, real_path, loc, xdata); + MAKE_ENTRY_HANDLE (real_path, par_path, this, + loc, &buf); + } + } - op_ret = lstat (real_path, &buf); op_errno = errno; if (op_ret == -1) { - if (op_errno != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - loc->path, strerror (op_errno)); - } - goto out; - } - - /* Make sure we don't access another mountpoint inside export dir. - * It may cause inode number to repeat from single export point, - * which leads to severe problems.. 
- */ - if (!priv->span_devices) { - if (priv->st_device[0] != buf.st_dev) { - op_errno = ENOENT; + if (op_errno != ENOENT) { gf_log (this->name, GF_LOG_ERROR, - "%s: different mountpoint/device, returning " - "ENOENT", loc->path); - goto out; + "lstat on %s failed: %s", + real_path, strerror (op_errno)); } - } else { - op_ret = posix_scale_st_ino (priv, &buf); - if (-1 == op_ret) { - op_errno = ENOENT; + + entry_ret = -1; + goto parent; + } + + if (xdata && (op_ret == 0)) { + xattr = posix_lookup_xattr_fill (this, real_path, loc, + xdata, &buf); + } + +parent: + if (par_path) { + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "%s: from different mountpoint", - loc->path); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } } - if (xattr_req && (op_ret == 0)) { - xattr = posix_lookup_xattr_fill (this, real_path, loc, - xattr_req, &buf); - } - - op_ret = 0; + op_ret = entry_ret; out: if (xattr) dict_ref (xattr); - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &buf, xattr); + if (!op_ret && !gfidless && uuid_is_null (buf.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "buf->ia_gfid is null for " + "%s", (real_path) ? 
real_path: ""); + op_ret = -1; + op_errno = ENODATA; + } + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, + (loc)?loc->inode:NULL, &buf, xattr, &postparent); if (xattr) dict_unref (xattr); @@ -315,14 +189,13 @@ out: int32_t -posix_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) +posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - struct stat buf = {0,}; - char * real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; + struct iatt buf = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_private *priv = NULL; + char *real_path = NULL; DECLARE_OLD_FS_ID_VAR; @@ -330,402 +203,829 @@ posix_stat (call_frame_t *frame, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = lstat (real_path, &buf); + MAKE_INODE_HANDLE (real_path, this, loc, &buf); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, + gf_log (this->name, (op_errno == ENOENT)? 
+ GF_LOG_DEBUG:GF_LOG_ERROR, + "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; } op_ret = 0; - out: +out: SET_TO_OLD_FS_ID(); - STACK_UNWIND (frame, op_ret, op_errno, &buf); + STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); return 0; } -int32_t -posix_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +static int +posix_do_chmod (xlator_t *this, const char *path, struct iatt *stbuf) { - char * real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - DIR * dir = NULL; - struct posix_fd * pfd = NULL; + int32_t ret = -1; + mode_t mode = 0; + struct stat stat; + int is_symlink = 0; + + ret = sys_lstat (path, &stat); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "lstat failed: %s (%s)", path, strerror (errno)); + goto out; + } + + if (S_ISLNK (stat.st_mode)) + is_symlink = 1; + + mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type); + ret = lchmod (path, mode); + if ((ret == -1) && (errno == ENOSYS)) { + /* in Linux symlinks are always in mode 0777 and no + such call as lchmod exists. 
+ */ + gf_log (this->name, GF_LOG_DEBUG, + "%s (%s)", path, strerror (errno)); + if (is_symlink) { + ret = 0; + goto out; + } + + ret = chmod (path, mode); + } +out: + return ret; +} + +static int +posix_do_chown (xlator_t *this, + const char *path, + struct iatt *stbuf, + int32_t valid) +{ + int32_t ret = -1; + uid_t uid = -1; + gid_t gid = -1; + + if (valid & GF_SET_ATTR_UID) + uid = stbuf->ia_uid; + + if (valid & GF_SET_ATTR_GID) + gid = stbuf->ia_gid; + + ret = lchown (path, uid, gid); + + return ret; +} + +static int +posix_do_utimes (xlator_t *this, + const char *path, + struct iatt *stbuf) +{ + int32_t ret = -1; + struct timeval tv[2] = {{0,},{0,}}; + struct stat stat; + int is_symlink = 0; + + ret = sys_lstat (path, &stat); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s (%s)", path, strerror (errno)); + goto out; + } + + if (S_ISLNK (stat.st_mode)) + is_symlink = 1; + + tv[0].tv_sec = stbuf->ia_atime; + tv[0].tv_usec = stbuf->ia_atime_nsec / 1000; + tv[1].tv_sec = stbuf->ia_mtime; + tv[1].tv_usec = stbuf->ia_mtime_nsec / 1000; + + ret = lutimes (path, tv); + if ((ret == -1) && (errno == ENOSYS)) { + gf_log (this->name, GF_LOG_DEBUG, + "%s (%s)", path, strerror (errno)); + if (is_symlink) { + ret = 0; + goto out; + } + + ret = utimes (path, tv); + } + +out: + return ret; +} + +int +posix_setattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; DECLARE_OLD_FS_ID_VAR; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - VALIDATE_OR_GOTO (fd, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, &statpre); - dir = opendir (real_path); - - if (dir == NULL) { + if (op_ret == -1) { op_errno = errno; 
gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s: %s", - loc->path, strerror (op_errno)); + "setattr (lstat) on %s failed: %s", real_path, + strerror (op_errno)); goto out; } - op_ret = dirfd (dir); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "dirfd() failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } + if (valid & GF_SET_ATTR_MODE) { + op_ret = posix_do_chmod (this, real_path, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "setattr (chmod) on %s failed: %s", real_path, + strerror (op_errno)); + goto out; + } + } - pfd = CALLOC (1, sizeof (*fd)); - if (!pfd) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ + op_ret = posix_do_chown (this, real_path, stbuf, valid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "setattr (chown) on %s failed: %s", real_path, + strerror (op_errno)); + goto out; + } } - pfd->dir = dir; - pfd->fd = dirfd (dir); - pfd->path = strdup (real_path); - if (!pfd->path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; + if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { + op_ret = posix_do_utimes (this, real_path, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "setattr (utimes) on %s failed: %s", real_path, + strerror (op_errno)); + goto out; + } } - fd_ctx_set (fd, this, (uint64_t)(long)pfd); + if (!valid) { + op_ret = lchown (real_path, -1, -1); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lchown (%s, -1, -1) failed => (%s)", + real_path, strerror (op_errno)); - op_ret = 0; + goto out; + } + } - out: + op_ret = posix_pstat (this, loc->gfid, real_path, &statpost); if (op_ret == -1) { - if (dir) { - closedir (dir); - dir = NULL; - } - if (pfd) { - if (pfd->path) - FREE (pfd->path); - FREE (pfd); - pfd = NULL; - } + 
op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "setattr (lstat) on %s failed: %s", real_path, + strerror (op_errno)); + goto out; } + op_ret = 0; + +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, fd); + + STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, + &statpre, &statpost, NULL); + return 0; } +int32_t +posix_do_fchown (xlator_t *this, + int fd, + struct iatt *stbuf, + int32_t valid) +{ + int ret = -1; + uid_t uid = -1; + gid_t gid = -1; + + if (valid & GF_SET_ATTR_UID) + uid = stbuf->ia_uid; + + if (valid & GF_SET_ATTR_GID) + gid = stbuf->ia_gid; + + ret = fchown (fd, uid, gid); + + return ret; +} + int32_t -posix_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int32_t flag) +posix_do_fchmod (xlator_t *this, + int fd, struct iatt *stbuf) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - dir_entry_t entries = {0, }; - dir_entry_t * tmp = NULL; - DIR * dir = NULL; - struct dirent * dirent = NULL; - int real_path_len = -1; - int entry_path_len = -1; - char * entry_path = NULL; - int count = 0; - struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; - struct stat buf = {0,}; - int ret = -1; - char tmp_real_path[ZR_PATH_MAX]; - char linkpath[ZR_PATH_MAX]; + mode_t mode = 0; - DECLARE_OLD_FS_ID_VAR ; + mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type); + return fchmod (fd, mode); +} + +static int +posix_do_futimes (xlator_t *this, + int fd, + struct iatt *stbuf) +{ + gf_log (this->name, GF_LOG_WARNING, "function not implemented fd(%d)", fd); + + errno = ENOSYS; + return -1; +} + +int +posix_fsetattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); VALIDATE_OR_GOTO (frame, out); 
VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_DEBUG, - "fd %p does not have context in %s", - fd, this->name); + "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd does not have path set (possibly file " - "fd, fd=%p)", fd); + + op_ret = posix_fdstat (this, pfd->fd, &statpre); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fsetattr (fstat) failed on fd=%p: %s", fd, + strerror (op_errno)); goto out; } - real_path = pfd->path; - real_path_len = strlen (real_path); + if (valid & GF_SET_ATTR_MODE) { + op_ret = posix_do_fchmod (this, pfd->fd, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fsetattr (fchmod) failed on fd=%p: %s", + fd, strerror (op_errno)); + goto out; + } + } + + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { + op_ret = posix_do_fchown (this, pfd->fd, stbuf, valid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fsetattr (fchown) failed on fd=%p: %s", + fd, strerror (op_errno)); + goto out; + } + + } + + if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { + op_ret = posix_do_futimes (this, pfd->fd, stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fsetattr (futimes) on failed fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + } + + if (!valid) { + op_ret = fchown (pfd->fd, -1, -1); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fchown (%d, -1, -1) failed => (%s)", + pfd->fd, strerror (op_errno)); - entry_path_len = real_path_len + NAME_MAX; - entry_path = CALLOC (1, entry_path_len); + goto out; + } + } - if (!entry_path) { + op_ret = 
posix_fdstat (this, pfd->fd, &statpost); + if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + "fsetattr (fstat) failed on fd=%p: %s", fd, + strerror (op_errno)); goto out; } - strncpy (entry_path, real_path, entry_path_len); - entry_path[real_path_len] = '/'; + op_ret = 0; - dir = pfd->dir; +out: + SET_TO_OLD_FS_ID (); - if (!dir) { - op_errno = EBADFD; + STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, + &statpre, &statpost, NULL); + + return 0; +} + +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, - "pfd does not have dir set (possibly file fd, " - "fd=%p, path=`%s'", - fd, real_path); + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); goto out; } - /* TODO: check for all the type of flag, and behave appropriately */ + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } - while ((dirent = readdir (dir))) { - if (!dirent) - break; + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } - /* This helps in self-heal, when only directories - needs to be replicated */ +out: + SET_TO_OLD_FS_ID (); - /* This is to reduce the network traffic, in case only - directory is needed from posix */ + return ret; 
+} - strncpy (tmp_real_path, real_path, ZR_PATH_MAX); - strncat (tmp_real_path, "/", - ZR_PATH_MAX - strlen (tmp_real_path)); +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; - strncat (tmp_real_path, dirent->d_name, - ZR_PATH_MAX - strlen (tmp_real_path)); - ret = lstat (tmp_real_path, &buf); + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} - if ((flag == GF_GET_DIR_ONLY) - && (ret != -1 && !S_ISDIR(buf.st_mode))) { - continue; +static int32_t +_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct) +{ + size_t num_vect = 0; + int32_t num_loop = 1; + int32_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + size_t remain = 0; + size_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + lseek(fd, offset, SEEK_SET); + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); 
+ if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} - tmp = CALLOC (1, sizeof (*tmp)); +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; - if (!tmp) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + DECLARE_OLD_FS_ID_VAR; - tmp->name = strdup (dirent->d_name); - if (!tmp->name) { - op_errno = errno; + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; goto out; } + } - if (entry_path_len < - (real_path_len + 1 + strlen (tmp->name) + 1)) { - entry_path_len = (real_path_len + - strlen (tmp->name) + 1024); + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; 
+ gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } - entry_path = realloc (entry_path, entry_path_len); - } +out: + SET_TO_OLD_FS_ID (); - strcpy (&entry_path[real_path_len+1], tmp->name); + return ret; +} - ret = lstat (entry_path, &tmp->buf); +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - entry_path, strerror (op_errno)); - goto out; - } + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; - if (S_ISLNK(tmp->buf.st_mode)) { + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; - ret = readlink (entry_path, linkpath, ZR_PATH_MAX); - if (ret != -1) { - linkpath[ret] = '\0'; - tmp->link = strdup (linkpath); - } - } else { - tmp->link = ""; - } + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; - count++; +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} - tmp->next = entries.next; - entries.next = tmp; +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; - /* if size is 0, count can never be = size, so entire - dir is read */ - if (count == size) - break; - } + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; - FREE (entry_path); + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; - op_ret = 0; +err: + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 
0; - out: - SET_TO_OLD_FS_ID (); +} - if (op_ret == -1) { - if (entry_path) - FREE (entry_path); +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +int32_t +posix_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd, dict_t *xdata) +{ + char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + DIR * dir = NULL; + struct posix_fd * pfd = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (fd, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + op_ret = -1; + dir = opendir (real_path); + + if (dir == NULL) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s: %s", + real_path, strerror (op_errno)); + goto out; } - STACK_UNWIND (frame, op_ret, op_errno, &entries, count); + op_ret = dirfd (dir); + if (op_ret < 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "dirfd() failed on %s: %s", + real_path, strerror (op_errno)); + goto out; + } - if (op_ret == 0) { - while (entries.next) { - tmp = entries.next; - entries.next = entries.next->next; - FREE (tmp->name); - FREE (tmp); + pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + op_errno = errno; + goto out; + } + + pfd->dir = dir; + pfd->fd = dirfd (dir); + + op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd 
context path=%s fd=%p", + real_path, fd); + + op_ret = 0; + +out: + if (op_ret == -1) { + if (dir) { + closedir (dir); + dir = NULL; + } + if (pfd) { + GF_FREE (pfd); + pfd = NULL; } } + SET_TO_OLD_FS_ID (); + STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL); return 0; } - int32_t posix_releasedir (xlator_t *this, - fd_t *fd) + fd_t *fd) { - int32_t op_ret = -1; - int32_t op_errno = 0; struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; + uint64_t tmp_pfd = 0; int ret = 0; + struct posix_private *priv = NULL; + VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); ret = fd_ctx_del (fd, this, &tmp_pfd); if (ret < 0) { - op_errno = -ret; gf_log (this->name, GF_LOG_DEBUG, "pfd from fd=%p is NULL", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; + pfd = (struct posix_fd *)(long)tmp_pfd; if (!pfd->dir) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "pfd->dir is NULL for fd=%p path=%s", - fd, pfd->path ? pfd->path : "<NULL>"); - goto out; - } - - ret = closedir (pfd->dir); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "closedir on %p failed: %s", pfd->dir, - strerror (errno)); - goto out; - } - pfd->dir = NULL; - - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_DEBUG, - "pfd->path was NULL. 
fd=%p pfd=%p", - fd, pfd); + gf_log (this->name, GF_LOG_WARNING, + "pfd->dir is NULL for fd=%p", fd); goto out; } - op_ret = 0; + priv = this->private; - out: - if (pfd) { - if (pfd->path) - FREE (pfd->path); - FREE (pfd); + pthread_mutex_lock (&priv->janitor_lock); + { + INIT_LIST_HEAD (&pfd->list); + list_add_tail (&pfd->list, &priv->janitor_fds); + pthread_cond_signal (&priv->janitor_cond); } + pthread_mutex_unlock (&priv->janitor_lock); +out: return 0; } int32_t posix_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { char * dest = NULL; int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + struct iatt stbuf = {0,}; DECLARE_OLD_FS_ID_VAR; - VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame, out); SET_FS_ID (frame->root->uid, frame->root->gid); dest = alloca (size + 1); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lstat on %s failed: %s", real_path, + strerror (op_errno)); + goto out; + } op_ret = readlink (real_path, dest, size); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "readlink on %s failed: %s", loc->path, + "readlink on %s failed: %s", real_path, strerror (op_errno)); goto out; } dest[op_ret] = 0; - - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, dest); + STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, dest, &stbuf, NULL); return 0; } -int32_t + +int posix_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev) + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { - int tmp_fd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = { 0, }; + int tmp_fd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + char *par_path = 0; + struct iatt stbuf = { 0, }; + char 
was_present = 1; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + void * uuid_req = NULL; DECLARE_OLD_FS_ID_VAR; @@ -733,68 +1033,157 @@ posix_mknod (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL); + gid = frame->root->gid; + + SET_FS_ID (frame->root->uid, gid); + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent of %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + + /* Check if the 'gfid' already exists, because this mknod may be an + internal call from distribute for creating 'linkfile', and that + linkfile may be for a hardlinked file */ + if (dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { + dict_del (xdata, GLUSTERFS_INTERNAL_FOP_KEY); + op_ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (op_ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get the gfid from dict for %s", + loc->path); + goto real_op; + } + op_ret = posix_create_link_if_gfid_exists (this, uuid_req, + real_path); + if (!op_ret) + goto post_op; + } + +real_op: +#ifdef __NetBSD__ + if (S_ISFIFO(mode)) + op_ret = mkfifo (real_path, mode); + else +#endif /* __NetBSD__ */ op_ret = mknod (real_path, mode, dev); if (op_ret == -1) { op_errno = errno; - if ((op_errno == EINVAL) && S_ISREG (mode)) { - /* Over Darwin, mknod with (S_IFREG|mode) - doesn't work */ - tmp_fd = creat (real_path, mode); - if (tmp_fd == -1) - goto out; - close (tmp_fd); - } else { + if ((op_errno == EINVAL) && S_ISREG (mode)) { + /* Over Darwin, mknod with (S_IFREG|mode) + doesn't 
work */ + tmp_fd = creat (real_path, mode); + if (tmp_fd == -1) { + gf_log (this->name, GF_LOG_ERROR, + "create failed on %s: %s", + real_path, strerror (errno)); + goto out; + } + close (tmp_fd); + } else { - gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } + gf_log (this->name, GF_LOG_ERROR, + "mknod on %s failed: %s", real_path, + strerror (op_errno)); + goto out; + } + } + + op_ret = posix_gfid_set (this, real_path, loc, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting gfid on %s failed", real_path); } #ifndef HAVE_SET_FSID - op_ret = lchown (real_path, frame->root->uid, frame->root->gid); + op_ret = lchown (real_path, frame->root->uid, gid); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lchown on %s failed: %s", loc->path, + "lchown on %s failed: %s", real_path, strerror (op_errno)); goto out; } #endif - op_ret = lstat (real_path, &stbuf); +post_op: + op_ret = posix_acl_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting ACLs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting xattrs on %s failed (%s)", real_path, + strerror (errno)); + } + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, + "mknod on %s failed: %s", real_path, strerror (op_errno)); goto out; } + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + STACK_UNWIND_STRICT (mknod, frame, op_ret, 
op_errno, + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); + + if ((op_ret == -1) && (!was_present)) { + unlink (real_path); + } return 0; } -int32_t + +int posix_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - struct stat stbuf = {0, }; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + struct iatt stbuf = {0, }; + char was_present = 1; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -802,44 +1191,116 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + /* The Hidden directory should be for housekeeping purpose and it + should not get created from a user request */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL); + + gid = frame->root->gid; + + op_ret = posix_pstat (this, NULL, real_path, &stbuf); + if ((op_ret == -1) && (errno == ENOENT)) { + was_present = 0; + } + + SET_FS_ID (frame->root->uid, gid); + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + mode |= S_ISGID; + } op_ret = mkdir (real_path, mode); if (op_ret == -1) { op_errno 
= errno; gf_log (this->name, GF_LOG_ERROR, - "mkdir of %s failed: %s", loc->path, + "mkdir of %s failed: %s", real_path, strerror (op_errno)); goto out; } + op_ret = posix_gfid_set (this, real_path, loc, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting gfid on %s failed", real_path); + } + #ifndef HAVE_SET_FSID - op_ret = chown (real_path, frame->root->uid, frame->root->gid); + op_ret = chown (real_path, frame->root->uid, gid); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", loc->path, + "chown on %s failed: %s", real_path, strerror (op_errno)); goto out; } #endif - op_ret = lstat (real_path, &stbuf); + op_ret = posix_acl_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting ACLs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting xattrs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, + "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; } + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "post-operation lstat on parent of %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); + + if ((op_ret == -1) && (!was_present)) { + unlink (real_path); + } return 0; } @@ -847,13 +1308,17 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, int32_t posix_unlink (call_frame_t *frame, xlator_t *this, - 
loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - int32_t fd = -1; - struct posix_private *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + int32_t fd = -1; + struct iatt stbuf = {0,}; + struct posix_private *priv = NULL; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -862,38 +1327,60 @@ posix_unlink (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + + if (stbuf.ia_nlink == 1) + posix_handle_unset (this, stbuf.ia_gfid, NULL); priv = this->private; if (priv->background_unlink) { - if (S_ISREG (loc->inode->st_mode)) { + if (IA_ISREG (loc->inode->ia_type)) { fd = open (real_path, O_RDONLY); if (fd == -1) { op_ret = -1; op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "open of %s failed: %s", loc->path, + "open of %s failed: %s", real_path, strerror (op_errno)); goto out; } } } - op_ret = unlink (real_path); + op_ret = sys_unlink (real_path); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "unlink of %s failed: %s", loc->path, + "unlink of %s failed: %s", real_path, strerror (op_errno)); goto out; } + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + 
STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, + &preparent, &postparent, NULL); if (fd != -1) { close (fd); @@ -902,13 +1389,20 @@ posix_unlink (call_frame_t *frame, xlator_t *this, return 0; } -int32_t + +int posix_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; - char * real_path = 0; + char * real_path = NULL; + char * par_path = NULL; + char * gfid_str = NULL; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + struct iatt stbuf; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -917,38 +1411,102 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = rmdir (real_path); + /* The Hidden directory should be for housekeeping purpose and it + should not get deleted from inside process */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "rmdir issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + + priv = this->private; + + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + + if (flags) { + gfid_str = uuid_utoa (stbuf.ia_gfid); + char *tmp_path = alloca (strlen (priv->trash_path) + + strlen ("/") + + strlen (gfid_str) + 1); + + mkdir (priv->trash_path, 0755); + sprintf (tmp_path, "%s/%s", priv->trash_path, gfid_str); + op_ret = rename (real_path, tmp_path); + } else { + op_ret = rmdir (real_path); + } op_errno = errno; - if (op_errno == EEXIST) - /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ - op_errno = ENOTEMPTY; 
+ if (op_ret == 0) { + posix_handle_unset (this, stbuf.ia_gfid, NULL); + } + + if (op_errno == EEXIST) + /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ + op_errno = ENOTEMPTY; + /* No need to log a common error as ENOTEMPTY */ if (op_ret == -1 && op_errno != ENOTEMPTY) { gf_log (this->name, GF_LOG_ERROR, - "rmdir of %s failed: %s", loc->path, + "rmdir of %s failed: %s", real_path, strerror (op_errno)); + } + + if (op_ret == -1) { + gf_log (this->name, + (op_errno == ENOTEMPTY) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "%s on %s failed", (flags) ? "rename" : "rmdir", + real_path); goto out; } - out: + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "post-operation lstat on parent of %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, + &preparent, &postparent, NULL); return 0; } -int32_t + +int posix_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc) + const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = { 0, }; + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + char * par_path = 0; + struct iatt stbuf = { 0, }; + struct posix_private *priv = NULL; + gid_t gid = 0; + char was_present = 1; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -957,8 +1515,31 @@ posix_symlink (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (linkname, out); VALIDATE_OR_GOTO (loc, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); + + if ((op_ret == -1) && (errno == ENOENT)){ + 
was_present = 0; + } + + SET_FS_ID (frame->root->uid, gid); + + gid = frame->root->gid; + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } op_ret = symlink (linkname, real_path); @@ -966,35 +1547,71 @@ posix_symlink (call_frame_t *frame, xlator_t *this, op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "symlink of %s --> %s failed: %s", - loc->path, linkname, strerror (op_errno)); + real_path, linkname, strerror (op_errno)); goto out; } + op_ret = posix_gfid_set (this, real_path, loc, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting gfid on %s failed", real_path); + } + #ifndef HAVE_SET_FSID - op_ret = lchown (real_path, frame->root->uid, frame->root->gid); + op_ret = lchown (real_path, frame->root->uid, gid); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lchown failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } #endif - op_ret = lstat (real_path, &stbuf); + + op_ret = posix_acl_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting ACLs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting xattrs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); + goto out; + } + + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + 
"post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); + + if ((op_ret == -1) && (!was_present)) { + unlink (real_path); + } return 0; } @@ -1002,13 +1619,26 @@ posix_symlink (call_frame_t *frame, xlator_t *this, int posix_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_oldpath = NULL; - char * real_newpath = NULL; - struct stat stbuf = {0, }; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = NULL; + char *real_newpath = NULL; + char *par_oldpath = NULL; + char *par_newpath = NULL; + struct iatt stbuf = {0, }; + struct posix_private *priv = NULL; + char was_present = 1; + struct iatt preoldparent = {0, }; + struct iatt postoldparent = {0, }; + struct iatt prenewparent = {0, }; + struct iatt postnewparent = {0, }; + char olddirid[64]; + char newdirid[64]; + uuid_t victim = {0}; + int was_dir = 0; + int nlink = 0; DECLARE_OLD_FS_ID_VAR; @@ -1017,213 +1647,237 @@ posix_rename (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (oldloc, out); VALIDATE_OR_GOTO (newloc, out); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, newloc->path); + MAKE_ENTRY_HANDLE (real_oldpath, par_oldpath, this, oldloc, NULL); + MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, newloc, &stbuf); - op_ret = rename (real_oldpath, real_newpath); + op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &preoldparent); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, - (op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), - "rename of %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on parent %s failed: %s", + par_oldpath, strerror (op_errno)); goto out; } - op_ret = lstat (real_newpath, &stbuf); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &prenewparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_newpath, strerror (op_errno)); + "pre-operation lstat on parent of %s failed: %s", + par_newpath, strerror (op_errno)); goto out; } - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); + if ((op_ret == -1) && (errno == ENOENT)){ + was_present = 0; + } else { + uuid_copy (victim, stbuf.ia_gfid); + if (IA_ISDIR (stbuf.ia_type)) + was_dir = 1; + nlink = stbuf.ia_nlink; + } - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + if (was_present && IA_ISDIR(stbuf.ia_type) && !newloc->inode) { + gf_log (this->name, GF_LOG_WARNING, + "found directory at %s while expecting ENOENT", + real_newpath); + op_ret = -1; + op_errno = EEXIST; + goto out; + } - return 0; -} + if (was_present && IA_ISDIR(stbuf.ia_type) && + uuid_compare (newloc->inode->gfid, stbuf.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "found directory %s at %s while renaming %s", + uuid_utoa_r (newloc->inode->gfid, olddirid), + real_newpath, + uuid_utoa_r (stbuf.ia_gfid, newdirid)); + op_ret = -1; + op_errno = EEXIST; + goto out; + } + if (IA_ISDIR (oldloc->inode->ia_type)) { + posix_handle_unset (this, oldloc->inode->gfid, NULL); + } -int -posix_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_oldpath = 0; - char * real_newpath = 0; - struct stat stbuf = {0, }; + op_ret = sys_rename (real_oldpath, real_newpath); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, + (op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), + "rename of %s to %s failed: %s", + real_oldpath, real_newpath, strerror (op_errno)); + goto out; + } + if (was_dir) + posix_handle_unset (this, victim, NULL); - DECLARE_OLD_FS_ID_VAR; + if (was_present && !was_dir && nlink == 1) + posix_handle_unset (this, victim, NULL); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); + if (IA_ISDIR (oldloc->inode->ia_type)) { + posix_handle_soft (this, real_newpath, newloc, + oldloc->inode->gfid, NULL); + } - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, newloc->path); + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lstat on %s failed: %s", + real_newpath, strerror (op_errno)); + goto out; + } - op_ret = link (real_oldpath, real_newpath); + op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &postoldparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "link %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_oldpath, strerror (op_errno)); goto out; } - op_ret = lstat (real_newpath, &stbuf); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postnewparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_newpath, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_newpath, strerror (op_errno)); goto out; } op_ret = 0; - out: +out: + SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, oldloc->inode, &stbuf); + STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf, + &preoldparent, &postoldparent, + &prenewparent, &postnewparent, NULL); + + if ((op_ret == -1) && !was_present) { + unlink (real_newpath); + } return 0; 
} int -posix_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode) +posix_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = 0; + char *real_newpath = 0; + char *par_newpath = 0; + struct iatt stbuf = {0, }; + struct posix_private *priv = NULL; + char was_present = 1; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (oldloc, out); + VALIDATE_OR_GOTO (newloc, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); - if (S_ISLNK (loc->inode->st_mode)) { - /* chmod on a link should always succeed */ - op_ret = lstat (real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_path, strerror (op_errno)); - goto out; - } - op_ret = 0; - goto out; - } + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_INODE_HANDLE (real_oldpath, this, oldloc, &stbuf); - op_ret = lchmod (real_path, mode); - if ((op_ret == -1) && (errno == ENOSYS)) { - gf_log (this->name, GF_LOG_TRACE, - "lchmod not implemented, falling back to chmod"); - op_ret = chmod (real_path, mode); + MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, newloc, &stbuf); + if ((op_ret == -1) && (errno == ENOENT)) { + was_present = 0; } + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &preparent); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "chmod on %s failed: %s", - loc->path, strerror (op_errno)); + gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", + par_newpath, strerror (op_errno)); goto out; } - op_ret = lstat 
(real_path, &stbuf); +#ifdef HAVE_LINKAT + /* + * On most systems (Linux being the notable exception), link(2) + * first resolves symlinks. If the target is a directory or + * is nonexistent, it will fail. linkat(2) operates on the + * symlink instead of its target when the AT_SYMLINK_FOLLOW + * flag is not supplied. + */ + op_ret = linkat (AT_FDCWD, real_oldpath, AT_FDCWD, real_newpath, 0); +#else + op_ret = link (real_oldpath, real_newpath); +#endif if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", - real_path, strerror (op_errno)); + gf_log (this->name, GF_LOG_ERROR, + "link %s to %s failed: %s", + real_oldpath, real_newpath, strerror (op_errno)); goto out; } - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -} - - -int -posix_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = {0,}; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = lchown (real_path, uid, gid); + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lchown on %s failed: %s", - loc->path, strerror (op_errno)); + "lstat on %s failed: %s", + real_newpath, strerror (op_errno)); goto out; } - op_ret = lstat (real_path, &stbuf); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postparent); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", - real_path, strerror (op_errno)); + gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", + par_newpath, strerror (op_errno)); goto out; } op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND 
(frame, op_ret, op_errno, &stbuf); + STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, + (oldloc)?oldloc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); + + if ((op_ret == -1) && (!was_present)) { + unlink (real_newpath); + } return 0; } int32_t -posix_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) +posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + struct posix_private *priv = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -1231,19 +1885,30 @@ posix_truncate (call_frame_t *frame, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + + MAKE_INODE_HANDLE (real_path, this, loc, &prebuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation lstat on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } op_ret = truncate (real_path, offset); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "truncate on %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } - op_ret = lstat (real_path, &stbuf); + op_ret = posix_pstat (this, loc->gfid, real_path, &postbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", @@ -1253,97 +1918,65 @@ posix_truncate (call_frame_t *frame, op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, + &prebuf, &postbuf, NULL); return 0; } int -posix_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec ts[2]) 
+posix_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - struct stat stbuf = {0,}; - struct timeval tv[2] = {{0,},{0,}}; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t _fd = -1; + int _flags = 0; + char * real_path = NULL; + char * par_path = NULL; + struct iatt stbuf = {0, }; + struct posix_fd * pfd = NULL; + struct posix_private * priv = NULL; + char was_present = 1; + + gid_t gid = 0; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); - tv[0].tv_sec = ts[0].tv_sec; - tv[0].tv_usec = ts[0].tv_nsec / 1000; - tv[1].tv_sec = ts[1].tv_sec; - tv[1].tv_usec = ts[1].tv_nsec / 1000; + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); - op_ret = lutimes (real_path, tv); - if ((op_ret == -1) && (errno == ENOSYS)) { - op_ret = utimes (real_path, tv); - } + gid = frame->root->gid; - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "utimes on %s failed: %s", real_path, - strerror (op_errno)); - goto out; - } + SET_FS_ID (frame->root->uid, gid); - op_ret = lstat (real_path, &stbuf); + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", real_path, - strerror (op_errno)); + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } - op_ret = 0; - - out: - SET_TO_OLD_FS_ID (); - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -} - -int32_t -posix_create (call_frame_t *frame, 
xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - int _flags = 0; - char * real_path = NULL; - struct stat stbuf = {0, }; - struct posix_fd * pfd = NULL; - struct posix_private * priv = NULL; - - DECLARE_OLD_FS_ID_VAR; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - - SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } if (!flags) { _flags = O_CREAT | O_RDWR | O_EXCL; @@ -1352,6 +1985,11 @@ posix_create (call_frame_t *frame, xlator_t *this, _flags = flags | O_CREAT; } + op_ret = posix_pstat (this, NULL, real_path, &stbuf); + if ((op_ret == -1) && (errno == ENOENT)) { + was_present = 0; + } + if (priv->o_direct) _flags |= O_DIRECT; @@ -1359,23 +1997,48 @@ posix_create (call_frame_t *frame, xlator_t *this, if (_fd == -1) { op_errno = errno; + op_ret = -1; gf_log (this->name, GF_LOG_ERROR, - "open on %s failed: %s", loc->path, + "open on %s failed: %s", real_path, strerror (op_errno)); goto out; } + if (was_present) + goto fill_stat; + + op_ret = posix_gfid_set (this, real_path, loc, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting gfid on %s failed", real_path); + } + #ifndef HAVE_SET_FSID - op_ret = chown (real_path, frame->root->uid, frame->root->gid); + op_ret = chown (real_path, frame->root->uid, gid); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "chown on %s failed: %s", - real_path, strerror (op_errno)); + real_path, strerror (op_errno)); } #endif - op_ret = fstat (_fd, &stbuf); + op_ret = posix_acl_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting ACLs on %s failed (%s)", real_path, + strerror (errno)); + } + + op_ret = 
posix_entry_create_xattr_set (this, real_path, xdata); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "setting xattrs on %s failed (%s)", real_path, + strerror (errno)); + } + +fill_stat: + op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1383,46 +2046,68 @@ posix_create (call_frame_t *frame, xlator_t *this, goto out; } - op_ret = -1; - pfd = CALLOC (1, sizeof (*pfd)); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); + goto out; + } + op_ret = -1; + pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd); if (!pfd) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); goto out; } pfd->flags = flags; pfd->fd = _fd; - fd_ctx_set (fd, this, (uint64_t)(long)pfd); + op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context path=%s fd=%p", + real_path, fd); - ((struct posix_private *)this->private)->stats.nr_files++; + LOCK (&priv->lock); + { + priv->nr_files++; + } + UNLOCK (&priv->lock); op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - if ((-1 == op_ret) && (_fd != -1)) + if ((-1 == op_ret) && (_fd != -1)) { close (_fd); - STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); + if (!was_present) { + unlink (real_path); + } + } + + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, + fd, (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, xdata); return 0; } int32_t posix_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd) + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; - int32_t _fd = -1; - struct posix_fd * pfd = NULL; - struct posix_private * priv = NULL; + int32_t op_ret = -1; + 
int32_t op_errno = 0; + char *real_path = NULL; + int32_t _fd = -1; + struct posix_fd *pfd = NULL; + struct posix_private *priv = NULL; + struct iatt stbuf = {0, }; DECLARE_OLD_FS_ID_VAR; @@ -1433,75 +2118,66 @@ posix_open (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd, out); priv = this->private; + VALIDATE_OR_GOTO (priv, out); + MAKE_INODE_HANDLE (real_path, this, loc, &stbuf); + + op_ret = -1; SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); if (priv->o_direct) flags |= O_DIRECT; _fd = open (real_path, flags, 0); if (_fd == -1) { + op_ret = -1; op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", real_path, strerror (op_errno)); goto out; } - pfd = CALLOC (1, sizeof (*pfd)); - + pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd); if (!pfd) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); goto out; } pfd->flags = flags; pfd->fd = _fd; - fd_ctx_set (fd, this, (uint64_t)(long)pfd); - - ((struct posix_private *)this->private)->stats.nr_files++; + op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd); + if (op_ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context path=%s fd=%p", + real_path, fd); -#ifndef HAVE_SET_FSID - if (flags & O_CREAT) { - op_ret = chown (real_path, frame->root->uid, frame->root->gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", - real_path, strerror (op_errno)); - goto out; - } + LOCK (&priv->lock); + { + priv->nr_files++; } -#endif + UNLOCK (&priv->lock); op_ret = 0; - out: +out: if (op_ret == -1) { if (_fd != -1) { close (_fd); - _fd = -1; } } SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); return 0; } -#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ - (unsigned long)(~(bound - 1)))) - int posix_readv (call_frame_t *frame, xlator_t *this, - fd_t 
*fd, size_t size, off_t offset) + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - uint64_t tmp_pfd = 0; int32_t op_ret = -1; int32_t op_errno = 0; int _fd = -1; @@ -1510,8 +2186,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, struct iobref * iobref = NULL; struct iovec vec = {0,}; struct posix_fd * pfd = NULL; - struct stat stbuf = {0,}; - int align = 1; + struct iatt stbuf = {0,}; int ret = -1; VALIDATE_OR_GOTO (frame, out); @@ -1520,45 +2195,30 @@ posix_readv (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this->private, out); priv = this->private; + VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; if (!size) { op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, "size=%"GF_PRI_SIZET, size); + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); goto out; } - if (pfd->flags & O_DIRECT) { - align = 4096; /* align to page boundary */ - } - - iobuf = iobuf_get (this->ctx->iobuf_pool); + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); if (!iobuf) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + op_errno = ENOMEM; goto out; } _fd = pfd->fd; - - op_ret = lseek (_fd, offset, SEEK_SET); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lseek(%"PRId64") failed: %s", - offset, strerror (op_errno)); - goto out; - } - - op_ret = read (_fd, iobuf->ptr, size); + op_ret = pread (_fd, iobuf->ptr, size, offset); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1567,13 +2227,15 @@ posix_readv (call_frame_t *frame, xlator_t *this, goto out; } - priv->read_value += op_ret; - priv->interval_read += op_ret; + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK 
(&priv->lock); vec.iov_base = iobuf->ptr; vec.iov_len = op_ret; - op_ret = -1; iobref = iobref_new (); iobref_add (iobref, iobuf); @@ -1583,7 +2245,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, * we read from */ - op_ret = fstat (_fd, &stbuf); + op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1591,11 +2253,16 @@ posix_readv (call_frame_t *frame, xlator_t *this, strerror (op_errno)); goto out; } - - op_ret = vec.iov_len; - out: - STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf, iobref); + /* Hack to notify higher layers of EOF. */ + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; +out: + + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, + &vec, 1, &stbuf, iobref, NULL); if (iobref) iobref_unref (iobref); @@ -1607,25 +2274,139 @@ posix_readv (call_frame_t *frame, xlator_t *this, int32_t -posix_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +__posix_pwritev (int fd, struct iovec *vector, int count, off_t offset) +{ + int32_t op_ret = 0; + int idx = 0; + int retval = 0; + off_t internal_off = 0; + + if (!vector) + return -EFAULT; + + internal_off = offset; + for (idx = 0; idx < count; idx++) { + retval = pwrite (fd, vector[idx].iov_base, vector[idx].iov_len, + internal_off); + if (retval == -1) { + op_ret = -errno; + goto err; + } + op_ret += retval; + internal_off += retval; + } + +err: + return op_ret; +} + +int32_t +__posix_writev (int fd, struct iovec *vector, int count, off_t startoff, + int odirect) +{ + int32_t op_ret = 0; + int idx = 0; + int max_buf_size = 0; + int retval = 0; + char *buf = NULL; + char *alloc_buf = NULL; + off_t internal_off = 0; + + /* Check for the O_DIRECT flag during open() */ + if (!odirect) + return __posix_pwritev (fd, vector, count, startoff); + + for (idx = 0; idx < count; idx++) { + if 
(max_buf_size < vector[idx].iov_len) + max_buf_size = vector[idx].iov_len; + } + + alloc_buf = _page_aligned_alloc (max_buf_size, &buf); + if (!alloc_buf) { + op_ret = -errno; + goto err; + } + + internal_off = startoff; + for (idx = 0; idx < count; idx++) { + memcpy (buf, vector[idx].iov_base, vector[idx].iov_len); + + /* not sure whether writev works on O_DIRECT'd fd */ + retval = pwrite (fd, buf, vector[idx].iov_len, internal_off); + if (retval == -1) { + op_ret = -errno; + goto err; + } + + op_ret += retval; + internal_off += retval; + } + +err: + GF_FREE (alloc_buf); + + return op_ret; +} + +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} + +int32_t +posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int _fd = -1; struct posix_private * priv = NULL; struct posix_fd * pfd = NULL; - 
struct stat stbuf = {0,}; + struct iatt preop = {0,}; + struct iatt postop = {0,}; int ret = -1; - - int idx = 0; - int align = 4096; - int max_buf_size = 0; - int retval = 0; - char * buf = NULL; - char * alloc_buf = NULL; - uint64_t tmp_pfd = 0; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1637,115 +2418,111 @@ posix_writev (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = lseek (_fd, offset, SEEK_SET); + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + So lock before preop-stat and unlock after write. 
+ */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lseek(%"PRId64") on fd=%p failed: %s", - offset, fd, strerror (op_errno)); + "pre-operation fstat failed on fd=%p: %s", fd, + strerror (op_errno)); goto out; } - /* Check for the O_DIRECT flag during open() */ - if (pfd->flags & O_DIRECT) { - /* This is O_DIRECT'd file */ - op_ret = -1; - for (idx = 0; idx < count; idx++) { - if (max_buf_size < vector[idx].iov_len) - max_buf_size = vector[idx].iov_len; - } - - alloc_buf = MALLOC (1 * (max_buf_size + align)); - if (!alloc_buf) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - for (idx = 0; idx < count; idx++) { - /* page aligned buffer */ - buf = ALIGN_BUF (alloc_buf, align); - - memcpy (buf, vector[idx].iov_base, - vector[idx].iov_len); - - /* not sure whether writev works on O_DIRECT'd fd */ - retval = write (_fd, buf, vector[idx].iov_len); - - if (retval == -1) { - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "O_DIRECT enabled on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } - break; - } - if (op_ret == -1) - op_ret = 0; - op_ret += retval; - } + op_ret = __posix_writev (_fd, vector, count, offset, + (pfd->flags & O_DIRECT)); - } else /* if (O_DIRECT) */ { + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } - /* This is not O_DIRECT'd fd */ - op_ret = writev (_fd, vector, count); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "writev failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 + ", %s", offset, strerror (op_errno)); + goto out; } - priv->write_value += op_ret; - 
priv->interval_write += op_ret; + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - ret = fstat (_fd, &stbuf); + + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } + } + + ret = posix_fdstat (this, _fd, &postop); if (ret == -1) { - op_ret = -1; + op_ret = -1; op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fstat failed on fd=%p: %s", + gf_log (this->name, GF_LOG_ERROR, + "post-operation fstat failed on fd=%p: %s", fd, strerror (op_errno)); goto out; } } - out: - if (alloc_buf) { - FREE (alloc_buf); - } +out: - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } int32_t posix_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, dict_t *xdata) { char * real_path = NULL; int32_t op_ret = -1; @@ -1758,7 +2535,7 @@ posix_statfs (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); VALIDATE_OR_GOTO (this->private, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); priv = this->private; @@ -1766,7 +2543,7 @@ posix_statfs (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, GF_LOG_ERROR, "statvfs failed on %s: %s", real_path, strerror (op_errno)); goto out; @@ -1783,116 +2560,127 @@ posix_statfs (call_frame_t *frame, xlator_t *this, op_ret = 0; - out: - STACK_UNWIND (frame, op_ret, op_errno, &buf); +out: + STACK_UNWIND_STRICT 
(statfs, frame, op_ret, op_errno, &buf, NULL); return 0; } int32_t posix_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; - int _fd = -1; - struct posix_fd * pfd = NULL; int ret = -1; - uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL on fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - /* do nothing */ op_ret = 0; - out: - STACK_UNWIND (frame, op_ret, op_errno); +out: + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); return 0; } int32_t -posix_release (xlator_t *this, - fd_t *fd) +posix_release (xlator_t *this, fd_t *fd) { - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; struct posix_private * priv = NULL; struct posix_fd * pfd = NULL; int ret = -1; - uint64_t tmp_pfd = 0; + uint64_t tmp_pfd = 0; VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); priv = this->private; - priv->stats.nr_files--; - - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = fd_ctx_del (fd, this, &tmp_pfd); if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; - - op_ret = close (_fd); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "close failed on fd=%p: %s", fd, strerror (op_errno)); - goto out; - } + pfd = (struct posix_fd *)(long)tmp_pfd; if (pfd->dir) { - op_ret = -1; - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd->dir is %p (not NULL) for file fd=%p", pfd->dir, fd); - goto out; } - op_ret = 0; + 
pthread_mutex_lock (&priv->janitor_lock); + { + INIT_LIST_HEAD (&pfd->list); + list_add_tail (&pfd->list, &priv->janitor_fds); + pthread_cond_signal (&priv->janitor_cond); + } + pthread_mutex_unlock (&priv->janitor_lock); - out: - if (pfd) - FREE (pfd); + LOCK (&priv->lock); + { + priv->nr_files--; + } + UNLOCK (&priv->lock); +out: return 0; } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t datasync) + fd_t *fd, int32_t datasync, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int _fd = -1; struct posix_fd * pfd = NULL; int ret = -1; - uint64_t tmp_pfd = 0; + struct iatt preop = {0,}; + struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -1908,183 +2696,94 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif - ret = fd_ctx_get (fd, this, &tmp_pfd); + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd not found in fd's ctx"); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; + op_ret = posix_fdstat (this, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + 
"pre-operation fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + if (datasync) { ; #ifdef HAVE_FDATASYNC op_ret = fdatasync (_fd); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "fdatasync on fd=%p failed: %s", + fd, strerror (errno)); + } #endif } else { op_ret = fsync (_fd); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, GF_LOG_ERROR, "fsync on fd=%p failed: %s", fd, strerror (op_errno)); + goto out; } } + op_ret = posix_fdstat (this, _fd, &postop); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "post-operation fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, &postop, + NULL); return 0; } static int gf_posix_xattr_enotsup_log; - -int -set_file_contents (xlator_t *this, char *real_path, - data_pair_t *trav, int flags) -{ - char * key = NULL; - char real_filepath[ZR_PATH_MAX] = {0,}; - int32_t file_fd = -1; - int op_ret = 0; - int ret = -1; - - key = &(trav->key[15]); - sprintf (real_filepath, "%s/%s", real_path, key); - - if (flags & XATTR_REPLACE) { - /* if file exists, replace it - * else, error out */ - file_fd = open (real_filepath, O_TRUNC|O_WRONLY); - - if (file_fd == -1) { - goto create; - } - - if (trav->value->len) { - ret = write (file_fd, trav->value->data, - trav->value->len); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "write failed while doing setxattr " - "for key %s on path %s: %s", - key, real_filepath, strerror (errno)); - goto out; - } - - ret = close (file_fd); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "close failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - } - - create: /* we know file doesn't exist, create it */ - - file_fd = open (real_filepath, 
O_CREAT|O_WRONLY, 0644); - - if (file_fd == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "failed to open file %s with O_CREAT: %s", - key, strerror (errno)); - goto out; - } - - ret = write (file_fd, trav->value->data, trav->value->len); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "write failed on %s while setxattr with " - "key %s: %s", - real_filepath, key, strerror (errno)); - goto out; - } - - ret = close (file_fd); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "close failed on %s while setxattr with " - "key %s: %s", - real_filepath, key, strerror (errno)); - goto out; - } - } - - out: - return op_ret; -} - -int -handle_pair (xlator_t *this, char *real_path, - data_pair_t *trav, int flags) +static int +_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) { - int sys_ret = -1; - int ret = 0; - - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - ret = set_file_contents (this, real_path, trav, flags); - } else { - sys_ret = sys_lsetxattr (real_path, trav->key, - trav->value->data, - trav->value->len, flags); - - if (sys_ret < 0) { - if (errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported"); - } else if (errno == ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr on %s failed: %s", real_path, - strerror (errno)); - } else { + posix_xattr_filler_t *filler = NULL; -#ifdef GF_DARWIN_HOST_OS - gf_log (this->name, - ((errno == EINVAL) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "%s: key:%s error:%s", - real_path, trav->key, - strerror (errno)); -#else /* ! 
DARWIN */ - gf_log (this->name, GF_LOG_ERROR, - "%s: key:%s error:%s", - real_path, trav->key, - strerror (errno)); -#endif /* DARWIN */ - } + filler = tmp; - ret = -errno; - goto out; - } - } - out: - return ret; + return posix_handle_pair (filler->this, filler->real_path, k, v, + filler->flags); } int32_t posix_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags) + loc_t *loc, dict_t *dict, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - data_pair_t * trav = NULL; - int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2094,96 +2793,72 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); VALIDATE_OR_GOTO (dict, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); - trav = dict->members_list; + op_ret = -1; + dict_del (dict, GFID_XATTR_KEY); - while (trav) { - ret = handle_pair (this, real_path, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; + filler.real_path = real_path; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); return 0; } + int -get_file_contents (xlator_t *this, char *real_path, - const char *name, char **contents) +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) { - char real_filepath[ZR_PATH_MAX] = {0,}; - char * key = NULL; - int32_t file_fd = -1; - struct stat stbuf = {0,}; - int op_ret = 0; - int ret = -1; - - key = (char *) &(name[15]); - sprintf (real_filepath, "%s/%s", real_path, key); - - op_ret = lstat (real_filepath, 
&stbuf); - if (op_ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - - file_fd = open (real_filepath, O_RDONLY); - - if (file_fd == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", - real_filepath, strerror (errno)); - goto out; - } - - *contents = CALLOC (stbuf.st_size + 1, sizeof(char)); - - if (! *contents) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } - - ret = read (file_fd, *contents, stbuf.st_size); - if (ret <= 0) { - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s", - real_filepath, strerror (errno)); - goto out; - } + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } - *contents[stbuf.st_size] = '\0'; + closedir (fd); - op_ret = close (file_fd); - file_fd = -1; - if (op_ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", - real_filepath, strerror (errno)); - goto out; - } + if (!found) + return -ENOENT; - out: - if (op_ret < 0) { - if (*contents) - FREE (*contents); - if (file_fd != -1) - close (file_fd); - } + ret = dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; - return op_ret; + return ret; } /** @@ -2193,20 +2868,25 @@ get_file_contents (xlator_t *this, char *real_path, */ int32_t posix_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) -{ 
- int32_t op_ret = -1; - int32_t op_errno = ENOENT; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char key[1024] = {0,}; - char * value = NULL; - char * list = NULL; - char * real_path = NULL; - dict_t * dict = NULL; - char * file_contents = NULL; - int ret = -1; + loc_t *loc, const char *name, dict_t *xdata) +{ + struct posix_private *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t list_offset = 0; + ssize_t size = 0; + size_t remaining_size = 0; + char key[4096] = {0,}; + char host_buf[1024] = {0,}; + char *value = NULL; + char *list = NULL; + char *real_path = NULL; + dict_t *dict = NULL; + char *file_contents = NULL; + int ret = -1; + char *path = NULL; + char *rpath = NULL; + char *dyn_rpath = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2215,28 +2895,194 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + op_ret = -1; + priv = this->private; - if (loc->inode && S_ISDIR(loc->inode->st_mode) && name && - ZR_FILE_CONTENT_REQUEST(name)) { - ret = get_file_contents (this, real_path, name, - &file_contents); + if (loc->inode && IA_ISDIR(loc->inode->ia_type) && name && + ZR_FILE_CONTENT_REQUEST(name)) { + ret = posix_get_file_contents (this, loc->gfid, &name[15], + &file_contents); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_ERROR, - "getting file contents failed: %s", + "getting file contents failed: %s", strerror (op_errno)); goto out; } } - /* Get the total size */ - dict = get_new_dict (); + dict = dict_new (); if (!dict) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); + op_errno = ENOMEM; goto out; } + if (loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 
0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? + GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { + if (!list_empty (&loc->inode->fd_list)) { + ret = dict_set_uint32 (dict, (char *)name, 1); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Failed to set dictionary value for %s", + name); + } else { + ret = dict_set_uint32 (dict, (char *)name, 0); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Failed to set dictionary value for %s", + name); + } + goto done; + } + if (loc->inode && name && + (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) { + if (LOC_HAS_ABSPATH (loc)) + MAKE_REAL_PATH (rpath, this, loc->path); + else + rpath = real_path; + + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? 
uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); + + dyn_rpath = gf_strdup (host_buf); + if (!dyn_rpath) { + ret = -1; + goto done; + } + size = strlen (dyn_rpath) + 1; + ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY, + dyn_rpath); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not set value (%s) in dictionary", + dyn_rpath); + GF_FREE (dyn_rpath); + } + + goto done; + } + + if (loc->inode && name && + (strcmp (name, GF_XATTR_NODE_UUID_KEY) == 0) + && !uuid_is_null (priv->glusterd_uuid)) { + (void) snprintf (host_buf, 1024, "%s", + uuid_utoa (priv->glusterd_uuid)); + + dyn_rpath = gf_strdup (host_buf); + if (!dyn_rpath) { + ret = -1; + goto done; + } + + size = strlen (dyn_rpath) + 1; + ret = dict_set_dynstr (dict, GF_XATTR_NODE_UUID_KEY, + dyn_rpath); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not set value (%s) in dictionary", + dyn_rpath); + GF_FREE (dyn_rpath); + } + goto done; + } + + if (loc->inode && name && + (strcmp (name, GFID_TO_PATH_KEY) == 0)) { + ret = inode_path (loc->inode, NULL, &path); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: could not get " + "inode path", uuid_utoa (loc->inode->gfid)); + goto done; + } + + ret = dict_set_dynstr (dict, GFID_TO_PATH_KEY, path); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not set value (%s) in dictionary", + host_buf); + GF_FREE (path); + } + goto done; + } + + if (name) { + strcpy (key, name); + + size = sys_lgetxattr (real_path, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); + } else { + gf_log (this->name, GF_LOG_ERROR, + 
"getxattr failed on %s: %s (%s)", + real_path, key, strerror (op_errno)); + } + + goto done; + } + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); + if (!value) { + op_ret = -1; + goto out; + } + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); + goto out; + } + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); + goto out; + } + + goto done; + } + size = sys_llistxattr (real_path, NULL, 0); if (size == -1) { op_errno = errno; @@ -2244,11 +3090,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); } else { gf_log (this->name, GF_LOG_ERROR, - "listxattr failed on %s: %s", + "listxattr failed on %s: %s", real_path, strerror (op_errno)); } goto out; @@ -2260,7 +3108,6 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, list = alloca (size + 1); if (!list) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); goto out; } @@ -2269,43 +3116,63 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, remaining_size = size; list_offset = 0; while (remaining_size > 0) { - if(*(list + list_offset) == '\0') + if (*(list + list_offset) == '\0') break; strcpy (key, list + list_offset); - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) + size = sys_lgetxattr (real_path, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); break; 
+ } - value = CALLOC (op_ret + 1, sizeof(char)); + value = GF_CALLOC (size + 1, sizeof(char), + gf_posix_mt_char); if (!value) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); goto out; } - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); break; + } + + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); + goto out; + } - value [op_ret] = '\0'; - dict_set (dict, key, data_from_dynptr (value, op_ret)); remaining_size -= strlen (key) + 1; list_offset += strlen (key) + 1; } /* while (remaining_size > 0) */ - done: +done: op_ret = size; if (dict) { - dict_ref (dict); + dict_del (dict, GFID_XATTR_KEY); } - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); @@ -2316,17 +3183,16 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, int32_t posix_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name) + fd_t *fd, const char *name, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = ENOENT; - uint64_t tmp_pfd = 0; struct posix_fd * pfd = NULL; int _fd = -1; int32_t list_offset = 0; - size_t size = 0; + ssize_t size = 0; size_t remaining_size = 0; - char key[1024] = {0,}; + char key[4096] = {0,}; char * value = NULL; char * list = NULL; dict_t * dict = NULL; @@ -2340,24 +3206,68 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, SET_FS_ID (frame->root->uid, frame->root->gid); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, 
&pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; /* Get the total size */ dict = get_new_dict (); if (!dict) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); goto out; } + if (name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { + ret = dict_set_uint32 (dict, (char *)name, 1); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Failed to set dictionary value for %s", + name); + goto done; + } + + if (name) { + strcpy (key, name); + + size = sys_fgetxattr (_fd, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "key %s (%s)", key, strerror (op_errno)); + goto done; + } + + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); + if (!value) { + op_ret = -1; + goto out; + } + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); + goto out; + } + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on key %s failed", key); + GF_FREE (value); + goto out; + } + goto done; + } + size = sys_flistxattr (_fd, NULL, 0); if (size == -1) { op_errno = errno; @@ -2365,11 +3275,12 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting " + "brick with 'user_xattr' flag)"); } else { gf_log (this->name, GF_LOG_ERROR, - "listxattr failed on %p: %s", + "listxattr failed on %p: %s", fd, strerror (op_errno)); } goto out; @@ -2381,7 +3292,6 @@ posix_fgetxattr (call_frame_t 
*frame, xlator_t *this, list = alloca (size + 1); if (!list) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); goto out; } @@ -2394,39 +3304,60 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_fgetxattr (_fd, key, NULL, 0); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); break; + } - value = CALLOC (op_ret + 1, sizeof(char)); + value = GF_CALLOC (size + 1, sizeof(char), + gf_posix_mt_char); if (!value) { + op_ret = -1; op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "the fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); break; + } - value [op_ret] = '\0'; - dict_set (dict, key, data_from_dynptr (value, op_ret)); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); + if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "failed on key %s", key); + GF_FREE (value); + goto out; + } remaining_size -= strlen (key) + 1; list_offset += strlen (key) + 1; } /* while (remaining_size > 0) */ - done: +done: op_ret = size; if (dict) { + dict_del (dict, GFID_XATTR_KEY); dict_ref (dict); } - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); @@ -2434,64 +3365,29 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, return 0; } +static int +_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + 
posix_xattr_filler_t *filler = NULL; -int -fhandle_pair (xlator_t *this, int fd, - data_pair_t *trav, int flags) -{ - int sys_ret = -1; - int ret = 0; - - sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data, - trav->value->len, flags); - - if (sys_ret < 0) { - if (errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported"); - } else if (errno == ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr on fd=%d failed: %s", fd, - strerror (errno)); - } else { - -#ifdef GF_DARWIN_HOST_OS - gf_log (this->name, - ((errno == EINVAL) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); -#else /* ! DARWIN */ - gf_log (this->name, GF_LOG_ERROR, - "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); -#endif /* DARWIN */ - } - - ret = -errno; - goto out; - } + filler = tmp; -out: - return ret; + return posix_fhandle_pair (filler->this, filler->fd, k, v, + filler->flags); } - int32_t posix_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int flags) + fd_t *fd, dict_t *dict, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; int _fd = -1; - data_pair_t * trav = NULL; - int ret = -1; + int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2501,103 +3397,190 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd, out); VALIDATE_OR_GOTO (dict, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL from fd=%p", fd); + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - trav = dict->members_list; - - while (trav) { - ret = fhandle_pair 
(this, _fd, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } + dict_del (dict, GFID_XATTR_KEY); - op_ret = 0; + filler.fd = _fd; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL); return 0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; + + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } + + return op_ret; +} + int32_t posix_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + if (!strcmp (GFID_XATTR_KEY, name)) { + gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" + " on gfid for file %s", real_path); + op_ret = -1; + goto out; + } + SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. 
+ */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); + if (op_ret == -1) { + op_errno = errno; + if (op_errno != ENOATTR && op_errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr on %s (for %s): %s", real_path, + name, strerror (op_errno)); + goto out; + } + op_ret = 0; + +out: + SET_TO_OLD_FS_ID (); + + STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +posix_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd * pfd = NULL; + int _fd = -1; + int ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + if (!strcmp (GFID_XATTR_KEY, name)) { + gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" + " on gfid for file"); + goto out; + } + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto out; + } + _fd = pfd->fd; + + + + SET_FS_ID (frame->root->uid, frame->root->gid); + + op_ret = sys_fremovexattr (_fd, name); if (op_ret == -1) { op_errno = errno; - if (op_errno != ENOATTR && op_errno != EPERM) - gf_log (this->name, GF_LOG_ERROR, - "removexattr on %s: %s", loc->path, - strerror (op_errno)); + if (op_errno != ENOATTR && op_errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "fremovexattr (for %s): %s", + name, strerror (op_errno)); goto out; } op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, NULL); return 0; } int32_t posix_fsyncdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) + fd_t *fd, int datasync, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 
0; - struct posix_fd * pfd = NULL; - int _fd = -1; int ret = -1; - uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - - _fd = pfd->fd; op_ret = 0; - out: - STACK_UNWIND (frame, op_ret, op_errno); +out: + STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, NULL); return 0; } @@ -2605,12 +3588,12 @@ posix_fsyncdir (call_frame_t *frame, xlator_t *this, void posix_print_xattr (dict_t *this, - char *key, - data_t *value, - void *data) + char *key, + data_t *value, + void *data) { - gf_log ("posix", GF_LOG_DEBUG, - "(key/val) = (%s/%d)", key, data_to_int32 (value)); + gf_log ("posix", GF_LOG_DEBUG, + "(key/val) = (%s/%d)", key, data_to_int32 (value)); } @@ -2624,245 +3607,281 @@ posix_print_xattr (dict_t *this, static void __add_array (int32_t *dest, int32_t *src, int count) { - int i = 0; - for (i = 0; i < count; i++) { - dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); - } + int i = 0; + int32_t destval = 0; + for (i = 0; i < count; i++) { + destval = ntoh32 (dest[i]); + if (destval == 0xffffffff) + continue; + dest[i] = hton32 (destval + ntoh32 (src[i])); + } } +static void +__or_array (int32_t *dest, int32_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) | ntoh32 (src[i])); + } +} -/** - * xattrop - xattr operations - for internal use by GlusterFS - * @optype: ADD_ARRAY: - * dict should contain: - * "key" ==> array of 32-bit numbers - */ - -int -posix_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) +static void +__and_array (int32_t *dest, int32_t *src, int count) { - char *real_path = NULL; - 
int32_t *array = NULL; - int size = 0; - int count = 0; + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) & ntoh32 (src[i])); + } +} - int op_ret = 0; - int op_errno = 0; +static void +__add_long_array (int64_t *dest, int64_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton64 (ntoh64 (dest[i]) + ntoh64 (src[i])); + } +} - data_pair_t *trav = NULL; +static int +_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + + count = v->len; + array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); + + LOCK (&inode->lock); + { + if (filler->real_path) { + size = sys_lgetxattr (filler->real_path, k, + (char *)array, v->len); + } else { + size = sys_fgetxattr (filler->fd, k, (char *)array, + v->len); + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (xattr, out); - VALIDATE_OR_GOTO (this, out); + op_errno = errno; + if ((size == -1) && (op_errno != ENODATA) && + (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr (marker_xattrs, + k)) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s while doing " + "xattrop: Key:%s (%s)", + filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fgetxattr failed on fd=%d while doing " + "xattrop: Key:%s (%s)", + filler->fd, + k, strerror (op_errno)); + } - trav = xattr->members_list; + op_ret = -1; + goto unlock; + } - if (loc->path) - 
MAKE_REAL_PATH (real_path, this, loc->path); + switch (optype) { - while (trav) { - count = trav->value->len / sizeof (int32_t); - array = CALLOC (count, sizeof (int32_t)); - - size = sys_lgetxattr (real_path, trav->key, (char *)array, - trav->value->len); + case GF_XATTROP_ADD_ARRAY: + __add_array ((int32_t *) array, (int32_t *) v->data, + v->len / 4); + break; - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else { - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: %s", loc->path, - strerror (op_errno)); - } - goto out; - } + case GF_XATTROP_ADD_ARRAY64: + __add_long_array ((int64_t *) array, (int64_t *) v->data, + v->len / 8); + break; - switch (optype) { + case GF_XATTROP_OR_ARRAY: + __or_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; - case GF_XATTROP_ADD_ARRAY: - __add_array (array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; + case GF_XATTROP_AND_ARRAY: + __and_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. Please send " + default: + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattrop type (%d) on %s. 
Please send " "a bug report to gluster-devel@nongnu.org", - optype, loc->path); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } - size = sys_lsetxattr (real_path, trav->key, array, - trav->value->len, 0); + if (filler->real_path) { + size = sys_lsetxattr (filler->real_path, k, array, + v->len, 0); + } else { + size = sys_fsetxattr (filler->fd, k, (char *)array, + v->len, 0); + } + } +unlock: + UNLOCK (&inode->lock); + + if (op_ret == -1) + goto out; + + op_errno = errno; + if (size == -1) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "setxattr failed on %s while doing xattrop: " + "key=%s (%s)", filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr failed on fd=%d while doing xattrop: " + "key=%s (%s)", filler->fd, + k, strerror (op_errno)); + + op_ret = -1; + goto out; + } else { + size = dict_set_bin (d, k, array, v->len); + + if (size != 0) { + if (filler->real_path) + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", filler->real_path, + k, strerror (-size)); + else + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (fd=%d): " + "key=%s (%s)", filler->fd, + k, strerror (-size)); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + array = NULL; + } + + array = NULL; - op_errno = errno; - if (size == -1) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", loc->path, - trav->key, strerror (op_errno)); - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", loc->path, - trav->key, strerror (-size)); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } - out: - if (array) - FREE (array); - 
STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; + return op_ret; } +/** + * xattrop - xattr operations - for internal use by GlusterFS + * @optype: ADD_ARRAY: + * dict should contain: + * "key" ==> array of 32-bit numbers + */ int -posix_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) { - int32_t *array = NULL; - int size = 0; - int count = 0; + int op_ret = 0; + int op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0,}; - int op_ret = 0; - int op_errno = 0; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (xattr, out); + VALIDATE_OR_GOTO (this, out); - int _fd = -1; - struct posix_fd *pfd = NULL; + if (fd) { + op_ret = posix_fd_ctx_get (fd, this, &pfd); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get pfd from fd=%p", + fd); + op_errno = EBADFD; + goto out; + } + _fd = pfd->fd; + } - data_pair_t *trav = NULL; - int32_t ret = -1; + if (loc && !uuid_is_null (loc->gfid)) + MAKE_INODE_HANDLE (real_path, this, loc, NULL); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (xattr, out); - VALIDATE_OR_GOTO (this, out); + if (real_path) { + inode = loc->inode; + } else if (fd) { + inode = fd->inode; + } - trav = xattr->members_list; + filler.this = this; + filler.fd = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; - if (fd) { - ret = fd_ctx_get (fd, this, (uint64_t *)&pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get pfd from fd=%p", - fd); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - _fd = pfd->fd; - } + op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair, + &filler); - while (trav) { - count = trav->value->len / sizeof (int32_t); - array = CALLOC (count, sizeof (int32_t)); - - size 
= sys_fgetxattr (_fd, trav->key, (char *)array, - trav->value->len); - - op_errno = errno; - if ((size == -1) && ((op_errno != ENODATA) && - (op_errno != ENOATTR))) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "extended attributes not " - "supported by filesystem"); - } else { - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while: " - "doing xattrop: %s", _fd, - strerror (op_errno)); - } - goto out; - } +out: - switch (optype) { - case GF_XATTROP_ADD_ARRAY: - __add_array (array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on fd=%d." - "Please send a bug report to " - "gluster-devel@nongnu.org", - optype, _fd); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL); + return 0; +} - size = sys_fsetxattr (_fd, trav->key, (char *)array, - trav->value->len, 0); - op_errno = errno; - if (size == -1) { - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing: " - "xattrop. 
key=%s (%s)", _fd, - trav->key, strerror (op_errno)); - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - trav->key, strerror (-size)); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } - -out: - if (array) - FREE (array); - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; +int +posix_xattrop (call_frame_t *frame, xlator_t *this, + loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + do_xattrop (frame, this, loc, NULL, optype, xattr); + return 0; +} + + +int +posix_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + do_xattrop (frame, this, NULL, fd, optype, xattr); + return 0; } int posix_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) + loc_t *loc, int32_t mask, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2871,37 +3890,37 @@ posix_access (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); op_ret = access (real_path, mask & 07); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } - op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL); return 0; } int32_t posix_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) + fd_t *fd, off_t offset, 
dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct stat buf = {0,}; - struct posix_fd * pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct iatt preop = {0,}; + struct iatt postop = {0,}; + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2910,85 +3929,92 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; + op_ret = posix_fdstat (this, _fd, &preop); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + op_ret = ftruncate (_fd, offset); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "ftruncate failed on fd=%p: %s", - fd, strerror (errno)); + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed on fd=%p (%"PRId64": %s", + fd, offset, strerror (errno)); goto out; } - op_ret = fstat (_fd, &buf); + op_ret = posix_fdstat (this, _fd, &postop); if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s", + gf_log (this->name, GF_LOG_ERROR, + "post-operation fstat failed on fd=%p: %s", fd, strerror (errno)); goto out; } op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, &buf); + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop, + &postop, NULL); return 0; } + int32_t -posix_fchown (call_frame_t *frame, xlator_t *this, - fd_t 
*fd, uid_t uid, gid_t gid) +posix_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct stat buf = {0,}; - struct posix_fd * pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct iatt buf = {0,}; + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + priv = this->private; + VALIDATE_OR_GOTO (priv, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = fchown (_fd, uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "fchown failed on fd=%p: %s", - fd, strerror (op_errno)); - goto out; - } - - op_ret = fstat (_fd, &buf); + op_ret = posix_fdstat (this, _fd, &buf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s", @@ -2998,792 +4024,621 @@ posix_fchown (call_frame_t *frame, xlator_t *this, op_ret = 0; - out: +out: SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, &buf); - + STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, &buf, NULL); return 0; } +static int gf_posix_lk_log; int32_t -posix_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode) +posix_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - struct stat buf = {0,}; - struct posix_fd * pfd = NULL; - int ret = -1; - uint64_t tmp_pfd = 0; - - DECLARE_OLD_FS_ID_VAR; - - SET_FS_ID 
(frame->root->uid, frame->root->gid); + struct gf_flock nullock = {0, }; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); - _fd = pfd->fd; + STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL); + return 0; +} - op_ret = fchmod (_fd, mode); +int32_t +posix_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fchmod failed on fd=%p: %s", fd, strerror (errno)); - goto out; - } + STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL); + return 0; +} - op_ret = fstat (_fd, &buf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fstat failed on fd=%p: %s", - fd, strerror (errno)); - goto out; - } +int32_t +posix_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. 
You need to use it for proper " + "functioning of your application."); - op_ret = 0; + STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL); + return 0; +} - out: - SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, &buf); +int32_t +posix_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. You need to use it for proper " + "functioning of your application."); + STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL); return 0; } - -static int -same_file_type (mode_t m1, mode_t m2) +int32_t +posix_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - return ((S_IFMT & (m1 ^ m2)) == 0); + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, + "\"features/locks\" translator is " + "not loaded. 
You need to use it for proper " + "functioning of your application."); + + STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL); + return 0; } -static int -ensure_file_type (xlator_t *this, char *pathname, mode_t mode) +int +posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, + gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs) { - struct stat stbuf = {0,}; - int op_ret = 0; - int ret = -1; - - ret = lstat (pathname, &stbuf); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "stat failed while trying to make sure entry %s " - "is a directory: %s", pathname, strerror (errno)); - goto out; + off_t in_case = -1; + size_t filled = 0; + int count = 0; + char entrybuf[sizeof(struct dirent) + 256 + 8]; + struct dirent *entry = NULL; + int32_t this_size = -1; + gf_dirent_t *this_entry = NULL; + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + struct stat stbuf = {0,}; + char *hpath = NULL; + int len = 0; + int ret = 0; + + if (skip_dirs) { + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; } - if (!same_file_type (mode, stbuf.st_mode)) { - op_ret = -EEXIST; - gf_log (this->name, GF_LOG_ERROR, - "entry %s is a different type of file " - "than expected", pathname); - goto out; + if (!off) { + rewinddir (dir); + } else { + seekdir (dir, off); } - out: - return op_ret; -} -static int -create_entry (xlator_t *this, int32_t flags, - dir_entry_t *entry, char *pathname) -{ - int op_ret = 0; - int ret = -1; - struct timeval tv[2] = {{0,0},{0,0}}; - - if (S_ISDIR (entry->buf.st_mode)) { - /* - * If the entry is directory, create it by - * calling 'mkdir'. If the entry is already - * present, check if it is a directory, - * and issue a warning if otherwise. 
- */ + while (filled <= size) { + in_case = telldir (dir); - ret = mkdir (pathname, entry->buf.st_mode); - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "mkdir %s with mode (0%o) failed: %s", - pathname, entry->buf.st_mode, - strerror (errno)); - goto out; - } + if (in_case == -1) { + gf_log (THIS->name, GF_LOG_ERROR, + "telldir failed on dir=%p: %s", + dir, strerror (errno)); + goto out; } - } else if ((flags & GF_SET_IF_NOT_PRESENT) - || !(flags & GF_SET_DIR_ONLY)) { - - /* create a 0-byte file here */ - - if (S_ISREG (entry->buf.st_mode)) { - ret = open (pathname, O_CREAT|O_EXCL, - entry->buf.st_mode); + errno = 0; + entry = NULL; + readdir_r (dir, (struct dirent *)entrybuf, &entry); - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "Error creating file %s with " - "mode (0%o): %s", - pathname, entry->buf.st_mode, - strerror (errno)); - goto out; - } + if (!entry) { + if (errno == EBADF) { + gf_log (THIS->name, GF_LOG_WARNING, + "readdir failed on dir=%p: %s", + dir, strerror (errno)); + goto out; } + break; + } - close (ret); - - } else if (S_ISLNK (entry->buf.st_mode)) { - ret = symlink (entry->link, pathname); - - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } - else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating symlink %s: %s" - , pathname, strerror (errno)); - goto out; - } - } +#ifdef __NetBSD__ + /* + * NetBSD with UFS1 backend uses backing files for + * extended attributes. 
They can be found in a + * .attribute file located at the root of the filesystem + * We hide it to glusterfs clients, since chaos will occur + * when the cluster/dht xlator decides to distribute + * exended attribute backing file accross storage servers. + */ + if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) + && (!strcmp(entry->d_name, ".attribute"))) + continue; +#endif /* __NetBSD__ */ + + if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) + && (!strcmp (GF_HIDDEN_PATH, entry->d_name))) { + continue; + } - } else if (S_ISBLK (entry->buf.st_mode) || - S_ISCHR (entry->buf.st_mode) || - S_ISFIFO (entry->buf.st_mode) || - S_ISSOCK (entry->buf.st_mode)) { - - ret = mknod (pathname, entry->buf.st_mode, - entry->buf.st_dev); - - if (ret == -1) { - if (errno == EEXIST) { - op_ret = ensure_file_type (this, - pathname, - entry->buf.st_mode); - } else { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating device file " - "%s: %s", - pathname, strerror (errno)); - goto out; - } + if (skip_dirs) { + if (DT_ISDIR (entry->d_type)) { + continue; + } else if (hpath) { + strcpy (&hpath[len+1],entry->d_name); + ret = lstat (hpath, &stbuf); + if (!ret && S_ISDIR (stbuf.st_mode)) + continue; } - } else { - gf_log (this->name, GF_LOG_ERROR, - "invalid mode 0%o for %s", entry->buf.st_mode, - pathname); - op_ret = -EINVAL; - goto out; - } - } - - /* - * Preserve atime and mtime - */ - - if (!S_ISLNK (entry->buf.st_mode)) { - tv[0].tv_sec = entry->buf.st_atime; - tv[1].tv_sec = entry->buf.st_mtime; - ret = utimes (pathname, tv); - if (ret == -1) { - op_ret = -errno; - gf_log (this->name, GF_LOG_ERROR, - "utimes %s failed: %s", - pathname, strerror (errno)); - goto out; - } - } - -out: - return op_ret; - -} - + } -int -posix_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, - int32_t count) -{ - char * real_path = NULL; - char * entry_path = NULL; - int32_t real_path_len = -1; - int32_t entry_path_len = -1; - int32_t 
ret = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct posix_fd * pfd = {0, }; - struct timeval tv[2] = {{0, }, {0, }}; - uint64_t tmp_pfd = 0; - char pathname[ZR_PATH_MAX] = {0,}; - dir_entry_t * trav = NULL; + this_size = max (sizeof (gf_dirent_t), + sizeof (gfs3_dirplist)) + + strlen (entry->d_name) + 1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (entries, out); + if (this_size + filled > size) { + seekdir (dir, in_case); + break; + } - tv[0].tv_sec = tv[0].tv_usec = 0; - tv[1].tv_sec = tv[1].tv_usec = 0; + this_entry = gf_dirent_for_name (entry->d_name); - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_DEBUG, - "fd's ctx not found on fd=%p for %s", - fd, this->name); - goto out; - } - pfd = (struct posix_fd *)(long)tmp_pfd; + if (!this_entry) { + gf_log (THIS->name, GF_LOG_ERROR, + "could not create gf_dirent for entry %s: (%s)", + entry->d_name, strerror (errno)); + goto out; + } + this_entry->d_off = telldir (dir); + this_entry->d_ino = entry->d_ino; + this_entry->d_type = entry->d_type; - real_path = pfd->path; + list_add_tail (&this_entry->list, &entries->list); - if (!real_path) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "path is NULL on pfd=%p fd=%p", pfd, fd); - goto out; + filled += this_size; + count ++; } - real_path_len = strlen (real_path); - entry_path_len = real_path_len + 256; - entry_path = CALLOC (1, entry_path_len); - - if (!entry_path) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "Out of memory."); - goto out; - } + if ((!readdir (dir) && (errno == 0))) + /* Indicate EOF */ + errno = ENOENT; +out: + return count; +} - strcpy (entry_path, real_path); - entry_path[real_path_len] = '/'; +dict_t * +posix_entry_xattr_fill (xlator_t *this, inode_t *inode, + fd_t *fd, char *name, dict_t *dict, + struct iatt *stbuf) +{ + loc_t tmp_loc = {0,}; + char *entry_path = NULL; - /* fd exists, 
and everything looks fine */ - /** - * create an entry for each one present in '@entries' - * - if flag is set (ie, if its namespace), create both directories - * and files - * - if not set, create only directories. - * - * after the entry is created, change the mode and ownership of the - * entry according to the stat present in entries->buf. - */ + /* if we don't send the 'loc', open-fd-count be a problem. */ + tmp_loc.inode = inode; - trav = entries->next; - while (trav) { - strcpy (pathname, entry_path); - strcat (pathname, trav->name); + MAKE_HANDLE_PATH (entry_path, this, fd->inode->gfid, name); - ret = create_entry (this, flags, trav, pathname); - if (ret < 0) { - op_errno = -ret; - goto out; - } + return posix_lookup_xattr_fill (this, entry_path, + &tmp_loc, dict, stbuf); - /* TODO: handle another flag, GF_SET_OVERWRITE */ +} - /* Change the mode */ - if (!S_ISLNK (trav->buf.st_mode)) { - ret = chmod (pathname, trav->buf.st_mode); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chmod on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } - } - /* change the ownership */ - ret = lchown (pathname, trav->buf.st_uid, trav->buf.st_gid); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chmod on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } - - if (flags & GF_SET_EPOCH_TIME) { - ret = utimes (pathname, tv); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "utimes on %s failed: %s", pathname, - strerror (op_errno)); - goto out; - } +int +posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dict) +{ + gf_dirent_t *entry = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + int len = 0; + struct iatt stbuf = {0, }; + uuid_t gfid; + + if (list_empty(&entries->list)) + return 0; + + itable = fd->inode->table; + + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca 
(len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + + list_for_each_entry (entry, &entries->list, list) { + memset (gfid, 0, 16); + inode = inode_grep (fd->inode->table, fd->inode, + entry->d_name); + if (inode) + uuid_copy (gfid, inode->gfid); + + strcpy (&hpath[len+1], entry->d_name); + + posix_pstat (this, gfid, hpath, &stbuf); + + if (!inode) + inode = inode_find (itable, stbuf.ia_gfid); + + if (!inode) + inode = inode_new (itable); + + entry->inode = inode; + + if (dict) { + entry->dict = + posix_entry_xattr_fill (this, entry->inode, + fd, entry->d_name, + dict, &stbuf); + dict_ref (entry->dict); } - /* consider the next entry */ - trav = trav->next; + entry->d_stat = stbuf; + if (stbuf.ia_ino) + entry->d_ino = stbuf.ia_ino; + inode = NULL; } - op_ret = 0; - out: - STACK_UNWIND (frame, op_ret, op_errno); - if (entry_path) - FREE (entry_path); - - return 0; + return 0; } + int32_t -posix_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) +posix_do_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict) { - int _fd = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - struct stat buf = {0,}; - struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; - int ret = -1; + struct posix_fd *pfd = NULL; + DIR *dir = NULL; + int ret = -1; + int count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + gf_dirent_t entries; + int32_t skip_dirs = 0; - DECLARE_OLD_FS_ID_VAR; - SET_FS_ID (frame->root->uid, frame->root->gid); VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + INIT_LIST_HEAD (&entries.list); + + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - _fd = pfd->fd; + dir 
= pfd->dir; + + if (!dir) { + gf_log (this->name, GF_LOG_WARNING, + "dir is NULL for fd=%p", fd); + op_errno = EINVAL; + goto out; + } - op_ret = fstat (_fd, &buf); + /* When READDIR_FILTER option is set to on, we can filter out + * directory's entry from the entry->list. + */ + ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); + + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. + + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. + */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s", - fd, strerror (op_errno)); + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + + if (whichop != GF_FOP_READDIRP) goto out; - } - op_ret = 0; + posix_readdirp_fill (this, fd, &entries, dict); - out: - SET_TO_OLD_FS_ID (); +out: + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free (&entries); - STACK_UNWIND (frame, op_ret, op_errno, &buf); return 0; } -static int gf_posix_lk_log; int32_t -posix_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct flock *lock) +posix_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, dict_t *xdata) { - struct flock nullock = {0, }; - - gf_posix_lk_log++; - - GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_ERROR, - "\"features/locks\" translator is " - "not loaded. 
You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND (frame, -1, ENOSYS, &nullock); + posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR, xdata); return 0; } + int32_t -posix_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct flock *lock) +posix_readdirp (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, dict_t *dict) { - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - "You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND (frame, -1, ENOSYS); + posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP, dict); return 0; } int32_t -posix_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct flock *lock) +posix_priv (xlator_t *this) { - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - "You need to use it for proper functioning of GlusterFS"); + struct posix_private *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, + this->name); + gf_proc_dump_add_section(key_prefix); + if (!this) + return 0; -int32_t -posix_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. 
" - "You need to use it for proper functioning of GlusterFS"); + priv = this->private; + + if (!priv) + return 0; + + gf_proc_dump_write("base_path","%s", priv->base_path); + gf_proc_dump_write("base_path_length","%d", priv->base_path_length); + gf_proc_dump_write("max_read","%d", priv->read_value); + gf_proc_dump_write("max_write","%d", priv->write_value); + gf_proc_dump_write("nr_files","%ld", priv->nr_files); - STACK_UNWIND (frame, -1, ENOSYS); return 0; } int32_t -posix_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) +posix_inode (xlator_t *this) { - gf_log (this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is not loaded. " - " You need to use it for proper functioning of GlusterFS"); - - STACK_UNWIND (frame, -1, ENOSYS); return 0; } int32_t -posix_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) +posix_rchecksum (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset, int32_t len, dict_t *xdata) { - uint64_t tmp_pfd = 0; - struct posix_fd * pfd = NULL; - DIR * dir = NULL; - int ret = -1; - size_t filled = 0; - int count = 0; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - gf_dirent_t * this_entry = NULL; - gf_dirent_t entries; - struct dirent * entry = NULL; - off_t in_case = -1; - int32_t this_size = -1; - + char *alloc_buf = NULL; + char *buf = NULL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int32_t weak_checksum = 0; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + struct posix_private *priv = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - INIT_LIST_HEAD (&entries.list); + priv = this->private; + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; + 
alloc_buf = _page_aligned_alloc (len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - - dir = pfd->dir; - if (!dir) { - gf_log (this->name, GF_LOG_DEBUG, - "dir is NULL for fd=%p", fd); - op_errno = EINVAL; + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL, fd=%p", fd); + op_errno = -ret; goto out; } + _fd = pfd->fd; - if (!off) { - rewinddir (dir); - } else { - seekdir (dir, off); - } + LOCK (&fd->lock); + { + if (priv->aio_capable && priv->aio_init_done) + __posix_fd_set_odirect (fd, pfd, 0, offset, len); - while (filled <= size) { - in_case = telldir (dir); + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); - if (in_case == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "telldir failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - - errno = 0; - entry = readdir (dir); - - if (!entry) { - if (errno == EBADF) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "readdir failed on dir=%p: %s", - dir, strerror (op_errno)); - goto out; - } - break; - } - - this_size = dirent_size (entry); - - if (this_size + filled > size) { - seekdir (dir, in_case); - break; } - - this_entry = gf_dirent_for_name (entry->d_name); - - if (!this_entry) { - gf_log (this->name, GF_LOG_ERROR, - "could not create gf_dirent for entry %s: (%s)", - entry->d_name, strerror (errno)); - goto out; - } - this_entry->d_off = telldir (dir); - this_entry->d_ino = entry->d_ino; - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - count ++; } + UNLOCK (&fd->lock); - op_ret = count; + if (ret < 0) + goto out; + + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, (unsigned char *) strong_checksum); - out: - STACK_UNWIND 
(frame, op_ret, op_errno, &entries); + op_ret = 0; +out: + STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, + weak_checksum, strong_checksum, NULL); - gf_dirent_free (&entries); + GF_FREE (alloc_buf); return 0; } +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ int32_t -posix_stats (call_frame_t *frame, xlator_t *this, - int32_t flags) - +notify (xlator_t *this, + int32_t event, + void *data, + ...) { - int32_t op_ret = -1; - int32_t op_errno = 0; - - struct xlator_stats xlstats = {0, }; - struct xlator_stats * stats = NULL; - struct statvfs buf = {0,}; - struct timeval tv = {0,}; - struct posix_private * priv = (struct posix_private *)this->private; - - int64_t avg_read = 0; - int64_t avg_write = 0; - int64_t _time_ms = 0; - - DECLARE_OLD_FS_ID_VAR; - - SET_FS_ID (frame->root->uid, frame->root->gid); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - - stats = &xlstats; - - op_ret = statvfs (priv->base_path, &buf); - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", - strerror (op_errno)); - goto out; + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that posix xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); } - - /* client info is maintained at FSd */ - stats->nr_clients = priv->stats.nr_clients; - stats->nr_files = priv->stats.nr_files; - - /* number of free block in the filesystem. 
*/ - stats->free_disk = buf.f_bfree * buf.f_bsize; - - stats->total_disk_size = buf.f_blocks * buf.f_bsize; - stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; - - /* Calculate read and write usage */ - op_ret = gettimeofday (&tv, NULL); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "gettimeofday failed: %s", strerror (errno)); - goto out; + break; + default: + /* */ + break; } + return 0; +} - /* Read */ - _time_ms = (tv.tv_sec - priv->init_time.tv_sec) * 1000 + - ((tv.tv_usec - priv->init_time.tv_usec) / 1000); +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; - avg_read = (_time_ms) ? (priv->read_value / _time_ms) : 0; /* KBps */ - avg_write = (_time_ms) ? (priv->write_value / _time_ms) : 0; /* KBps */ + if (!this) + return ret; - _time_ms = (tv.tv_sec - priv->prev_fetch_time.tv_sec) * 1000 + - ((tv.tv_usec - priv->prev_fetch_time.tv_usec) / 1000); + ret = xlator_mem_acct_init (this, gf_posix_mt_end + 1); - if (_time_ms && ((priv->interval_read / _time_ms) > priv->max_read)) { - priv->max_read = (priv->interval_read / _time_ms); + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; } - if (_time_ms && - ((priv->interval_write / _time_ms) > priv->max_write)) { - priv->max_write = priv->interval_write / _time_ms; - } + return ret; +} - stats->read_usage = avg_read / priv->max_read; - stats->write_usage = avg_write / priv->max_write; +static int +posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) +{ + struct posix_private *priv = NULL; + int ret = -1; - op_ret = gettimeofday (&(priv->prev_fetch_time), NULL); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "gettimeofday failed: %s", - strerror (op_errno)); - goto out; - } + priv = this->private; - priv->interval_read = 0; - priv->interval_write = 0; + ret = sys_chown (priv->base_path, uid, gid); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "uid/gid for 
brick path %s, %s", + priv->base_path, strerror (errno)); - op_ret = 0; + return ret; +} - out: - SET_TO_OLD_FS_ID (); - STACK_UNWIND (frame, op_ret, op_errno, stats); - return 0; +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; } -int32_t -posix_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flag) + +int +reconfigure (xlator_t *this, dict_t *options) { - char * real_path = NULL; - DIR * dir = NULL; - struct dirent * dirent = NULL; - uint8_t file_checksum[ZR_FILENAME_MAX] = {0,}; - uint8_t dir_checksum[ZR_FILENAME_MAX] = {0,}; - int32_t op_ret = -1; - int32_t op_errno = 0; - int i = 0; - int length = 0; + int ret = -1; + struct posix_private *priv = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str = NULL; - struct stat buf = {0,}; - char tmp_real_path[ZR_PATH_MAX] = {0,}; - int ret = -1; + priv = this->private; - MAKE_REAL_PATH (real_path, this, loc->path); + GF_OPTION_RECONF ("brick-uid", uid, options, uint32, out); + GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); + posix_set_owner (this, uid, gid); - dir = opendir (real_path); + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); - if (!dir){ - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "opendir() failed on `%s': %s", - real_path, strerror (op_errno)); - goto out; - } + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); - while ((dirent = readdir 
(dir))) { - errno = 0; - if (!dirent) { - if (errno != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "readdir() failed on dir=%p: %s", - dir, strerror (errno)); - goto out; - } - break; - } + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } - length = strlen (dirent->d_name); + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, + options, bool, out); - strcpy (tmp_real_path, real_path); - strcat (tmp_real_path, "/"); - strcat (tmp_real_path, dirent->d_name); - ret = lstat (tmp_real_path, &buf); + if (priv->aio_configured) + posix_aio_on (this); + else + posix_aio_off (this); - if (ret == -1) - continue; + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); - if (S_ISDIR (buf.st_mode)) { - for (i = 0; i < length; i++) - dir_checksum[i] ^= dirent->d_name[i]; - } else { - for (i = 0; i < length; i++) - file_checksum[i] ^= dirent->d_name[i]; - } + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); } - closedir (dir); - - op_ret = 0; - out: - STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); - return 0; + ret = 0; +out: + return ret; } -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that posix xlator is up */ - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - default: - /* */ - break; - } - return 0; -} /** * init - @@ -3791,13 +4646,24 @@ notify (xlator_t *this, int init (xlator_t *this) { - int ret = 0; - int op_ret = -1; - gf_boolean_t tmp_bool = 0; - struct stat buf = {0,}; - struct posix_private * _private = NULL; - data_t * dir_data = NULL; - data_t * tmp_data = NULL; + struct posix_private *_private = NULL; + data_t *dir_data = NULL; + data_t *tmp_data = NULL; + struct stat buf = {0,}; + gf_boolean_t tmp_bool = 0; + int dict_ret = 0; + int ret = 0; + int op_ret = -1; + ssize_t size = -1; + int32_t janitor_sleep = 0; + uuid_t old_uuid = {0,}; + uuid_t dict_uuid = {0,}; + uuid_t gfid = {0,}; + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + char *guuid = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -3808,10 +4674,10 @@ init (xlator_t *this) goto out; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. Please check the volume file."); - } + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling. Please check the volume file."); + } if (!dir_data) { gf_log (this->name, GF_LOG_CRITICAL, @@ -3822,144 +4688,249 @@ init (xlator_t *this) umask (000); // umask `masking' is done at the client side - /* Check whether the specified directory exists, if not create it. */ - op_ret = lstat (dir_data->data, &buf); - if ((ret != 0) || !S_ISDIR (buf.st_mode)) { + /* Check whether the specified directory exists, if not log it. 
*/ + op_ret = stat (dir_data->data, &buf); + if ((op_ret != 0) || !S_ISDIR (buf.st_mode)) { gf_log (this->name, GF_LOG_ERROR, "Directory '%s' doesn't exist, exiting.", - dir_data->data); + dir_data->data); ret = -1; goto out; } - /* Check for Extended attribute support, if not present, log it */ op_ret = sys_lsetxattr (dir_data->data, - "trusted.glusterfs.test", "working", 8, 0); - if (op_ret < 0) { - tmp_data = dict_get (this->options, - "mandate-attribute"); - if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &tmp_bool) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for key " - "\"mandate-xattr\""); - ret = -1; - goto out; - } - if (!tmp_bool) { - gf_log (this->name, GF_LOG_WARNING, - "Extended attribute not supported, " - "starting as per option"); - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Extended attribute not supported, " - "exiting."); - ret = -1; - goto out; - } - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Extended attribute not supported, exiting."); - ret = -1; - goto out; - } + "trusted.glusterfs.test", "working", 8, 0); + if (op_ret == 0) { + sys_lremovexattr (dir_data->data, "trusted.glusterfs.test"); + } else { + tmp_data = dict_get (this->options, + "mandate-attribute"); + if (tmp_data) { + if (gf_string2boolean (tmp_data->data, + &tmp_bool) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "wrong option provided for key " + "\"mandate-attribute\""); + ret = -1; + goto out; + } + if (!tmp_bool) { + gf_log (this->name, GF_LOG_WARNING, + "Extended attribute not supported, " + "starting as per option"); + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Extended attribute not supported, " + "exiting."); + ret = -1; + goto out; + } + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Extended attribute not supported, exiting."); + ret = -1; + goto out; + } } - _private = CALLOC (1, sizeof (*_private)); + tmp_data = dict_get (this->options, "volume-id"); + if (tmp_data) { + op_ret = uuid_parse 
(tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in volume file", + tmp_data->data); + ret = -1; + goto out; + } + size = sys_lgetxattr (dir_data->data, + "trusted.glusterfs.volume-id", old_uuid, 16); + if (size == 16) { + if (uuid_compare (old_uuid, dict_uuid)) { + gf_log (this->name, GF_LOG_ERROR, + "mismatching volume-id (%s) received. " + "already is a part of volume %s ", + tmp_data->data, uuid_utoa (old_uuid)); + ret = -1; + goto out; + } + } else if ((size == -1) && (errno == ENODATA)) { + + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + ret = -1; + goto out; + + } else if ((size == -1) && (errno != ENODATA)) { + /* Wrong 'volume-id' is set, it should be error */ + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to fetch volume-id (%s)", + dir_data->data, strerror (errno)); + ret = -1; + goto out; + } else { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to fetch proper volume id from export"); + goto out; + } + } + + /* Now check if the export directory has some other 'gfid', + other than that of root '/' */ + size = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); + if (size == 16) { + if (!__is_root_gfid (gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid (%s) is not that of glusterfs '/' ", + dir_data->data, uuid_utoa (gfid)); + ret = -1; + goto out; + } + } else if (size != -1) { + /* Wrong 'gfid' is set, it should be error */ + gf_log (this->name, GF_LOG_WARNING, + "%s: wrong value set as gfid", + dir_data->data); + ret = -1; + goto out; + } else if ((size == -1) && (errno != ENODATA)) { + /* Wrong 'gfid' is set, it should be error */ + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to fetch gfid (%s)", + dir_data->data, strerror (errno)); + ret = -1; + goto out; + } else { + /* First time volume, set the GFID */ + size = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, + 16, XATTR_CREATE); + 
if (size) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid (%s)", + dir_data->data, strerror (errno)); + ret = -1; + goto out; + } + } + + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, + NULL, 0); + if ((size < 0) && (errno == ENOTSUP)) + gf_log (this->name, GF_LOG_WARNING, + "Posix access control list is not supported."); + + ret = 0; + _private = GF_CALLOC (1, sizeof (*_private), + gf_posix_mt_posix_private); if (!_private) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); ret = -1; goto out; } - _private->base_path = strdup (dir_data->data); + _private->base_path = gf_strdup (dir_data->data); _private->base_path_length = strlen (_private->base_path); - { - /* Stats related variables */ - gettimeofday (&_private->init_time, NULL); - gettimeofday (&_private->prev_fetch_time, NULL); - _private->max_read = 1; - _private->max_write = 1; + LOCK_INIT (&_private->lock); + + ret = dict_get_str (this->options, "hostname", &_private->hostname); + if (ret) { + _private->hostname = GF_CALLOC (256, sizeof (char), + gf_common_mt_char); + if (!_private->hostname) { + goto out; + } + ret = gethostname (_private->hostname, 256); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", strerror (errno)); + } } _private->export_statfs = 1; tmp_data = dict_get (this->options, "export-statfs-size"); if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->export_statfs) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "'export-statfs-size' takes only boolean " - "options"); - goto out; - } + if (gf_string2boolean (tmp_data->data, + &_private->export_statfs) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "'export-statfs-size' takes only boolean " + "options"); + goto out; + } if (!_private->export_statfs) gf_log (this->name, GF_LOG_DEBUG, - "'statfs()' returns dummy size"); + "'statfs()' returns dummy size"); } _private->background_unlink = 0; tmp_data = dict_get (this->options, 
"background-unlink"); if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->background_unlink) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "'export-statfs-size' takes only boolean " - "options"); - goto out; - } + if (gf_string2boolean (tmp_data->data, + &_private->background_unlink) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "'background-unlink' takes only boolean " + "options"); + goto out; + } if (_private->background_unlink) gf_log (this->name, GF_LOG_DEBUG, - "unlinks will be performed in background"); + "unlinks will be performed in background"); } tmp_data = dict_get (this->options, "o-direct"); if (tmp_data) { - if (gf_string2boolean (tmp_data->data, - &_private->o_direct) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for 'o-direct'"); - goto out; - } - if (_private->o_direct) + if (gf_string2boolean (tmp_data->data, + &_private->o_direct) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "wrong option provided for 'o-direct'"); + goto out; + } + if (_private->o_direct) gf_log (this->name, GF_LOG_DEBUG, "o-direct mode is enabled (O_DIRECT " - "for every open)"); + "for every open)"); } - _private->num_devices_to_span = 1; - - tmp_data = dict_get (this->options, "span-devices"); - if (tmp_data) { - if (gf_string2int32 (tmp_data->data, - &_private->num_devices_to_span) == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "wrong option provided for 'span-devices'"); - goto out; - } - if (_private->num_devices_to_span > 1) { - gf_log (this->name, GF_LOG_NORMAL, - "spanning enabled accross %d mounts", - _private->num_devices_to_span); - _private->span_devices = 1; - } - if (_private->num_devices_to_span < 1) - _private->num_devices_to_span = 1; + ret = dict_get_str (this->options, "glusterd-uuid", &guuid); + if (!ret) { + if (uuid_parse (guuid, _private->glusterd_uuid)) + gf_log (this->name, GF_LOG_WARNING, "Cannot parse " + "glusterd (node) UUID, node-uuid xattr " 
+ "request would return - \"No such attribute\""); + } else { + gf_log (this->name, GF_LOG_DEBUG, "No glusterd (node) UUID " + "passed - node-uuid xattr request will return " + "\"No such attribute\""); } - _private->st_device = CALLOC (1, (sizeof (dev_t) * - _private->num_devices_to_span)); - - /* Start with the base */ - _private->st_device[0] = buf.st_dev; + ret = 0; + _private->janitor_sleep_duration = 600; + + dict_ret = dict_get_int32 (this->options, "janitor-sleep-duration", + &janitor_sleep); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Setting janitor sleep duration to %d.", + janitor_sleep); + + _private->janitor_sleep_duration = janitor_sleep; + } + /* performing open dir on brick dir locks the brick dir + * and prevents it from being unmounted + */ + _private->mount_lock = opendir (dir_data->data); + if (!_private->mount_lock) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Could not lock brick directory"); + goto out; + } #ifndef GF_DARWIN_HOST_OS { struct rlimit lim; @@ -3968,29 +4939,106 @@ init (xlator_t *this) if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { gf_log (this->name, GF_LOG_WARNING, - "Failed to set 'ulimit -n " - " 1048576': %s", strerror(errno)); + "Failed to set 'ulimit -n " + " 1048576': %s", strerror(errno)); lim.rlim_cur = 65536; lim.rlim_max = 65536; if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { gf_log (this->name, GF_LOG_WARNING, - "Failed to set maximum allowed open " - "file descriptors to 64k: %s", + "Failed to set maximum allowed open " + "file descriptors to 64k: %s", strerror(errno)); } else { - gf_log (this->name, GF_LOG_NORMAL, - "Maximum allowed open file descriptors " + gf_log (this->name, GF_LOG_INFO, + "Maximum allowed open file descriptors " "set to 65536"); } } } #endif - this->private = (void *)_private; - out: + op_ret = posix_handle_init (this); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix handle setup failed"); + ret = -1; + goto out; + } + + op_ret = 
posix_handle_trash_init (this); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Posix landfill setup failed"); + ret = -1; + goto out; + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("brick-uid", uid, uint32, out); + GF_OPTION_INIT ("brick-gid", gid, uint32, out); + posix_set_owner (this, uid, gid); + + GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on (this); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix AIO init failed"); + ret = -1; + goto out; + } + } + + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + + pthread_mutex_init (&_private->janitor_lock, NULL); + pthread_cond_init (&_private->janitor_cond, NULL); + INIT_LIST_HEAD (&_private->janitor_fds); + + posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", 
_private->batch_fsync_delay_usec, + uint32, out); +out: return ret; } @@ -3998,13 +5046,19 @@ void fini (xlator_t *this) { struct posix_private *priv = this->private; - sys_lremovexattr (priv->base_path, "trusted.glusterfs.test"); - FREE (priv); + if (!priv) + return; + this->private = NULL; + /*unlock brick dir*/ + if (priv->mount_lock) + closedir (priv->mount_lock); + GF_FREE (priv); return; } -struct xlator_mops mops = { - .stats = posix_stats, +struct xlator_dumpops dumpops = { + .priv = posix_priv, + .inode = posix_inode, }; struct xlator_fops fops = { @@ -4012,6 +5066,7 @@ struct xlator_fops fops = { .stat = posix_stat, .opendir = posix_opendir, .readdir = posix_readdir, + .readdirp = posix_readdirp, .readlink = posix_readlink, .mknod = posix_mknod, .mkdir = posix_mkdir, @@ -4020,10 +5075,7 @@ struct xlator_fops fops = { .symlink = posix_symlink, .rename = posix_rename, .link = posix_link, - .chmod = posix_chmod, - .chown = posix_chown, .truncate = posix_truncate, - .utimens = posix_utimens, .create = posix_create, .open = posix_open, .readv = posix_readv, @@ -4036,42 +5088,105 @@ struct xlator_fops fops = { .getxattr = posix_getxattr, .fgetxattr = posix_fgetxattr, .removexattr = posix_removexattr, + .fremovexattr = posix_fremovexattr, .fsyncdir = posix_fsyncdir, .access = posix_access, .ftruncate = posix_ftruncate, .fstat = posix_fstat, .lk = posix_lk, - .inodelk = posix_inodelk, - .finodelk = posix_finodelk, - .entrylk = posix_entrylk, - .fentrylk = posix_fentrylk, - .fchown = posix_fchown, - .fchmod = posix_fchmod, - .setdents = posix_setdents, - .getdents = posix_getdents, - .checksum = posix_checksum, - .xattrop = posix_xattrop, - .fxattrop = posix_fxattrop, + .inodelk = posix_inodelk, + .finodelk = posix_finodelk, + .entrylk = posix_entrylk, + .fentrylk = posix_fentrylk, + .rchecksum = posix_rchecksum, + .xattrop = posix_xattrop, + .fxattrop = posix_fxattrop, + .setattr = posix_setattr, + .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + 
.discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { - .release = posix_release, - .releasedir = posix_releasedir, - .forget = posix_forget + .release = posix_release, + .releasedir = posix_releasedir, + .forget = posix_forget }; struct volume_options options[] = { - { .key = {"o-direct"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"directory"}, - .type = GF_OPTION_TYPE_PATH }, - { .key = {"export-statfs-size"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"mandate-attribute"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {"span-devices"}, - .type = GF_OPTION_TYPE_INT }, + { .key = {"o-direct"}, + .type = GF_OPTION_TYPE_BOOL }, + { .key = {"directory"}, + .type = GF_OPTION_TYPE_PATH }, + { .key = {"hostname"}, + .type = GF_OPTION_TYPE_ANY }, + { .key = {"export-statfs-size"}, + .type = GF_OPTION_TYPE_BOOL }, + { .key = {"mandate-attribute"}, + .type = GF_OPTION_TYPE_BOOL }, { .key = {"background-unlink"}, - .type = GF_OPTION_TYPE_BOOL }, - { .key = {NULL} } + .type = GF_OPTION_TYPE_BOOL }, + { .key = {"janitor-sleep-duration"}, + .type = GF_OPTION_TYPE_INT }, + { .key = {"volume-id"}, + .type = GF_OPTION_TYPE_ANY }, + { .key = {"glusterd-uuid"}, + .type = GF_OPTION_TYPE_STR }, + { + .key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + { + .key = {"brick-uid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting uid of brick's owner" + }, + { + .key = {"brick-gid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting gid of brick's owner" + }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + 
.default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." + }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, + { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index ed6b46430..3121db271 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_H #define _POSIX_H @@ -29,6 +19,7 @@ #include <unistd.h> #include <sys/types.h> #include <dirent.h> +#include <time.h> #ifdef linux #ifdef __GLIBC__ @@ -49,7 +40,18 @@ #include "xlator.h" #include "inode.h" #include "compat.h" +#include "timer.h" +#include "posix-mem-types.h" +#include "posix-handle.h" +#include "call-stub.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#include "posix-aio.h" +#endif +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 /** * posix_fd - internal structure common to file and directory fd's */ @@ -57,27 +59,33 @@ struct posix_fd { int fd; /* fd returned by the kernel */ int32_t flags; /* flags for open/creat */ - char * path; /* used by setdents/getdents */ DIR * dir; /* handle returned by the kernel */ + int odirect; + struct list_head list; /* to add to the janitor list */ }; + struct posix_private { char *base_path; int32_t base_path_length; + gf_lock_t lock; + + char *hostname; /* Statistics, provides activity of the server */ - struct xlator_stats stats; - + struct timeval prev_fetch_time; struct timeval init_time; - int32_t max_read; /* */ - int32_t max_write; /* */ - int64_t interval_read; /* Used to calculate the max_read value */ - int64_t interval_write; /* Used to calculate the max_write value */ + time_t last_landfill_check; + int32_t janitor_sleep_duration; + struct list_head janitor_fds; + pthread_cond_t janitor_cond; + pthread_mutex_t janitor_lock; + int64_t read_value; /* Total read, from init */ int64_t write_value; /* Total write, from init */ - + int64_t nr_files; /* In some cases, two exported volumes may reside on the same partition on the server. 
Sending statvfs info for both @@ -91,28 +99,110 @@ struct posix_private { gf_boolean_t o_direct; /* always open files in O_DIRECT mode */ - gf_boolean_t span_devices; -/* +/* decide whether posix_unlink does open (file), unlink (file), close (fd) instead of just unlink (file). with the former approach there is no lockout of access to parent directory during removal of very large files for the entire duration of freeing of data blocks. -*/ +*/ gf_boolean_t background_unlink; - int num_devices_to_span; - dev_t *st_device; +/* janitor thread which cleans up /.trash (created by replicate) */ + pthread_t janitor; + gf_boolean_t janitor_present; + char * trash_path; +/* lock for brick dir */ + DIR *mount_lock; + + struct stat handledir; + +/* uuid of glusterd that swapned the brick process */ + uuid_t glusterd_uuid; + + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif + + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; }; +typedef struct { + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct iatt *stbuf; + loc_t *loc; + inode_t *inode; /* for all do_xattrop() key handling */ + int fd; + int flags; + int32_t op_errno; +} posix_xattr_filler_t; + + #define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) #define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) -#define MAKE_REAL_PATH(var, this, path) do { \ - 
var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ - strcpy (var, POSIX_BASE_PATH(this)); \ - strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ - } while (0) - +/* Helper functions */ +int posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, + dict_t *xattr_req); +int posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p); +int posix_istat (xlator_t *this, uuid_t gfid, const char *basename, + struct iatt *iatt); +int posix_pstat (xlator_t *this, uuid_t gfid, const char *real_path, + struct iatt *iatt); +dict_t *posix_lookup_xattr_fill (xlator_t *this, const char *path, + loc_t *loc, dict_t *xattr, struct iatt *buf); +int posix_handle_pair (xlator_t *this, const char *real_path, char *key, + data_t *value, int flags); +int posix_fhandle_pair (xlator_t *this, int fd, char *key, data_t *value, + int flags); +void posix_spawn_janitor_thread (xlator_t *this); +int posix_get_file_contents (xlator_t *this, uuid_t pargfid, + const char *name, char **contents); +int posix_set_file_contents (xlator_t *this, const char *path, char *key, + data_t *value, int flags); +int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); +int posix_entry_create_xattr_set (xlator_t *this, const char *path, + dict_t *dict); + +int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd); +void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf); + +gf_boolean_t posix_special_xattr (char **pattern, char *key); + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); #endif /* _POSIX_H */ |
