diff options
Diffstat (limited to 'xlators/storage')
23 files changed, 6489 insertions, 6560 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index 9cb9ded30..c08e8e41b 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,3 +1,7 @@ SUBDIRS = posix -CLEANFILES = +if ENABLE_BD_XLATOR +SUBDIRS += bd +endif + +CLEANFILES = diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bd/Makefile.am index d471a3f92..a985f42a8 100644 --- a/xlators/storage/bdb/Makefile.am +++ b/xlators/storage/bd/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am new file mode 100644 index 000000000..3d93f7442 --- /dev/null +++ b/xlators/storage/bd/src/Makefile.am @@ -0,0 +1,20 @@ +if ENABLE_BD_XLATOR +xlator_LTLIBRARIES = bd.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +bd_la_LDFLAGS = -module -avoid-version +LIBBD = -llvm2app -lrt +bd_la_SOURCES = bd.c bd-helper.c bd-aio.c +bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO) + +noinst_HEADERS = bd.h bd-aio.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) + +CLEANFILES = + +endif diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c new file mode 100644 index 000000000..62d4590f7 --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.c @@ -0,0 +1,527 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: M. Mohan Kumar <mohan@in.ibm.com> + + Based on posix-aio.c + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <lvm2app.h> +#include <sys/uio.h> + +#include "xlator.h" +#include "glusterfs.h" +#include "defaults.h" +#include "bd.h" +#include "bd-aio.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> + +struct bd_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int op; + off_t offset; + fd_t *fd; +}; + +void +__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = bd_fd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags & (~O_DIRECT))); + bd_fd->odirect = 0; + } + + if (odirect && !bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags | O_DIRECT)); + bd_fd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). 
fd=%d flags=%d pfd->odirect=%d", + strerror (errno), bd_fd->fd, flags, bd_fd->odirect); + } +} + +int +bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + off_t offset = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + this = frame->this; + iobuf = paiocb->iobuf; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)", + paiocb->fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + /* Hack to notify higher layers of EOF. 
*/ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + _fd = bd_fd->fd; + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->op = GF_FOP_READ; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log 
(this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + prebuf = paiocb->prebuf; + this = frame->this; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%p,offset=%llu (%d/%s)", + paiocb->fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +int +bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_writev_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, iov, count, offset, flags, iobref, xdata); + return 0; + } + + 
bd_inode_ctx_get (fd->inode, this, &bdatt); + + _fd = bd_fd->fd; + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->op = GF_FOP_WRITE; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt)); + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +void * +bd_aio_thread (void *data) +{ + xlator_t *this = NULL; + bd_priv_t *priv = NULL; + int ret = 0; + int i = 0; + struct io_event *event = NULL; + struct bd_aio_cb *paiocb = NULL; + struct io_event events[BD_AIO_MAX_NR_GETEVENTS]; + struct timespec ts = {0, }; + + this = data; + THIS = this; + priv = this->private; + + ts.tv_sec = 5; + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS, + &events[0], &ts); + if (ret < 0) { + if (ret == -EINTR) + continue; + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d, exiting", ret); + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + bd_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + 
bd_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + +int +bd_aio_init (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = pthread_create (&priv->aiothread, NULL, + bd_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; +out: + return ret; +} + + +int +bd_aio_on (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = bd_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; + } + + return ret; +} + +int +bd_aio_off (xlator_t *this) +{ + this->fops->readv = bd_readv; + this->fops->writev = bd_writev; + + return 0; +} + +#else + +int +bd_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +bd_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." 
+ " Continuing with synchronous IO"); + return 0; +} + +void +__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h new file mode 100644 index 000000000..16f686a4c --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.h @@ -0,0 +1,41 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _BD_AIO_H +#define _BD_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +/* + * Maximum number of concurrently submitted IO events. 
The heaviest load + * GlusterFS has been able to handle had 60-80 concurrent calls + */ +#define BD_AIO_MAX_NR_EVENTS 256 + +/* Maximum number of completed IO operations to reap per getevents syscall */ +#define BD_AIO_MAX_NR_GETEVENTS 16 + +int bd_aio_on (xlator_t *this); +int bd_aio_off (xlator_t *this); + +int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_BD_AIO_H */ diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c new file mode 100644 index 000000000..5525e346b --- /dev/null +++ b/xlators/storage/bd/src/bd-helper.c @@ -0,0 +1,783 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "run.h" + +int +bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} + +int +bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = inode_ctx_get (inode, this, &ctx_int); + if (ret) + return ret; + if (ctx) + *ctx = (bd_attr_t *) ctx_int; +out: + return ret; +} + +void +bd_local_free (xlator_t *this, bd_local_t *local) +{ + if (!local) + return; + if (local->fd) + fd_unref (local->fd); + else if (local->loc.path) + loc_wipe (&local->loc); + if (local->dict) + dict_unref (local->dict); + if (local->inode) + inode_unref (local->inode); + if (local->bdatt) { + GF_FREE (local->bdatt->type); + GF_FREE (local->bdatt); + } + 
mem_put (local); + local = NULL; +} + +bd_local_t * +bd_local_init (call_frame_t *frame, xlator_t *this) +{ + frame->local = mem_get0 (this->local_pool); + if (!frame->local) + return NULL; + + return frame->local; +} + +/* + * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format. + * This function validates this tag agains volume-uuid. Also goes + * through LV list to find out if a thin-pool is configured or not. + */ +int bd_scan_vg (xlator_t *this, bd_priv_t *priv) +{ + vg_t brick = NULL; + data_t *tmp_data = NULL; + struct dm_list *tags = NULL; + int op_ret = -1; + uuid_t dict_uuid = {0, }; + uuid_t vg_uuid = {0, }; + gf_boolean_t uuid = _gf_false; + lvm_str_list_t *strl = NULL; + struct dm_list *lv_dm_list = NULL; + lv_list_t *lv_list = NULL; + struct dm_list *dm_seglist = NULL; + lvseg_list_t *seglist = NULL; + lvm_property_value_t prop = {0, }; + gf_boolean_t thin = _gf_false; + const char *lv_name = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + lv_dm_list = lvm_vg_list_lvs (brick); + if (!lv_dm_list) + goto check; + + dm_list_iterate_items (lv_list, lv_dm_list) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + if (!dm_seglist) + continue; + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + thin = _gf_true; + lv_name = lvm_lv_get_name (lv_list->lv); + priv->pool = gf_strdup (lv_name); + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lv_name); + break; + } + } + } + +check: + /* If there is no volume-id set in dict, we cant validate */ + tmp_data = dict_get (this->options, "volume-id"); + if (!tmp_data) { + op_ret = 0; + goto out; + } + + op_ret = uuid_parse (tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_log 
(this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in volume file", + tmp_data->data); + op_ret = -1; + goto out; + } + + tags = lvm_vg_get_tags (brick); + if (!tags) { /* no tags in the VG */ + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + dm_list_iterate_items (strl, tags) { + if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + uuid = _gf_true; + break; + } + } + /* UUID tag is not set in VG */ + if (!uuid) { + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + + op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1, + vg_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in VG", strl->str); + op_ret = -1; + goto out; + } + if (uuid_compare (dict_uuid, vg_uuid)) { + gf_log (this->name, GF_LOG_ERROR, + "mismatching volume-id (%s) received. 
" + "already is a part of volume %s ", + tmp_data->data, vg_uuid); + op_ret = -1; + goto out; + } + + op_ret = 0; + +out: + lvm_vg_close (brick); + + if (!thin) + gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in " + "VG %s\n", priv->vg); + else + priv->caps |= BD_CAPS_THIN; + + return op_ret; +} + +/* FIXME: Move this code to common place, so posix and bd xlator can use */ +char * +page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char); + if (!alloc_buf) + return NULL; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; + + return alloc_buf; +} + +static int +__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) +{ + int ret = -1; + int _fd = -1; + char *devpath = NULL; + bd_fd_t *bdfd = NULL; + uint64_t tmp_bdfd = 0; + bd_priv_t *priv = this->private; + bd_gfid_t gfid = {0, }; + bd_attr_t *bdatt = NULL; + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + return 0; + + ret = __fd_ctx_get (fd, this, &tmp_bdfd); + if (ret == 0) { + bdfd = (void *)(long) tmp_bdfd; + *bdfd_p = bdfd; + return 0; + } + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + if (!devpath) + goto out; + + _fd = open (devpath, O_RDWR | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bdfd, ret, out); + + bdfd->fd = _fd; + bdfd->flag = O_RDWR | O_LARGEFILE; + if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + *bdfd_p = bdfd; + + ret = 0; +out: + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bdfd); + } + return ret; +} + +int 
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd)
+{
+        int   ret;
+
+        /* FIXME: Is it ok to fd->lock here ? */
+        LOCK (&fd->lock);
+        {
+                ret = __bd_fd_ctx_get (this, fd, bdfd);
+        }
+        UNLOCK (&fd->lock);
+
+        return ret;
+}
+
+/*
+ * Validates if LV exists for given inode or not.
+ *
+ * bd      - BD xattr value, "<type>" or "<type>:<size>". It is modified in
+ *           place: the last ':' is overwritten with NUL to split off the
+ *           size part.
+ * type    - receives a gf_strdup()ed copy of the type string once the type
+ *           has been accepted (BD_LV or BD_THIN). It is not released here
+ *           on any path; presumably the caller owns it - TODO confirm.
+ * lv_size - receives the value of lvm_lv_get_size() when the LV is found.
+ * uuid    - gfid of the file; its string form is used as the LV name.
+ *
+ * Returns 0 if LV exists and size also matches.
+ * Returns -1 if the xattr type is invalid, the device node cannot be
+ * stat()ed, or the VG/LV cannot be opened.
+ * Returns 1 if LV size mismatches; lv_size is updated with the actual size.
+ *
+ * NOTE(review): 0 is also returned, with lv_size left untouched, when the
+ * device path cannot be allocated.
+ */
+int
+bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                      uint64_t *lv_size, uuid_t uuid)
+{
+        char        *path  = NULL;
+        int          ret   = -1;
+        bd_gfid_t    gfid  = {0, };
+        bd_priv_t   *priv  = this->private;
+        struct stat  stbuf = {0, };
+        uint64_t     size  = 0;
+        vg_t         vg    = NULL;
+        lv_t         lv    = NULL;
+        char        *bytes = NULL;
+
+        /* split "<type>:<size>" and parse the recorded size, if present */
+        bytes = strrchr (bd, ':');
+        if (bytes) {
+                *bytes = '\0';
+                bytes++;
+                gf_string2bytesize (bytes, &size);
+        }
+
+        if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid xattr %s", bd);
+                return -1;
+        }
+        *type = gf_strdup (bd);
+
+        /*
+         * Check if LV really exist, there could be a failure
+         * after setxattr and successful LV creation
+         */
+        uuid_utoa_r (uuid, gfid);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid);
+        if (!path) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "insufficient memory");
+                return 0;
+        }
+
+        /* Destination file does not exist */
+        if (stat (path, &stbuf)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lstat failed for path %s", path);
+                /* leave through out so that path is released */
+                ret = -1;
+                goto out;
+        }
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (!vg) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "VG %s does not exist?", priv->vg);
+                ret = -1;
+                goto out;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "LV %s does not exist", gfid);
+                ret = -1;
+                goto out;
+        }
+
+        /* compare the size recorded in the xattr with the real LV size */
+        *lv_size = lvm_lv_get_size (lv);
+        if (size == *lv_size) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = 1;
+
+out:
+        if (vg)
+                lvm_vg_close (vg);
+
+        GF_FREE (path);
+        return ret;
+}
+ +static int +create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent) +{ + int ret = -1; + runner_t runner = {0, }; + char *path = NULL; + struct stat stat = {0, }; + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--thin", NULL); + runner_argprintf (&runner, "%s/%s", vg, pool); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", lv); + runner_add_args (&runner, "--virtualsize", NULL); + runner_argprintf (&runner, "%ldB", extent); + runner_start (&runner); + runner_end (&runner); + + gf_asprintf (&path, "/dev/%s/%s", vg, lv); + if (!path) { + ret = ENOMEM; + goto out; + } + if (lstat (path, &stat) < 0) + ret = EAGAIN; + else + ret = 0; +out: + GF_FREE (path); + return ret; +} + +int +bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv) +{ + int ret = 0; + vg_t vg = NULL; + bd_gfid_t gfid = {0, }; + + uuid_utoa_r (uuid, gfid); + + if (!strcmp (type, BD_THIN)) + return create_thin_lv (priv->vg, priv->pool, gfid, + size); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return ENOENT; + } + + if (!lvm_vg_create_lv_linear (vg, gfid, size)) { + gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear " + "failed"); + ret = errno; + } + + lvm_vg_close (vg); + + return ret; +} + +int32_t +bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size) +{ + uint64_t new_size = 0; + runner_t runner = {0, }; + bd_gfid_t gfid = {0, }; + int ret = 0; + vg_t vg = NULL; + lv_t lv = NULL; + + uuid_utoa_r (uuid, gfid); + + runinit (&runner); + + runner_add_args (&runner, LVM_RESIZE, NULL); + runner_argprintf (&runner, "%s/%s", priv->vg, gfid); + runner_argprintf (&runner, "-L%ldb", size); + runner_add_args (&runner, "-f", NULL); + + runner_start (&runner); + runner_end (&runner); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening 
VG %s failed", + priv->vg); + return EAGAIN; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid); + ret = EIO; + goto out; + } + new_size = lvm_lv_get_size (lv); + + if (new_size != size) { + gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does " + "not match requested size %ld", new_size, size); + ret = EIO; + } + +out: + lvm_vg_close (vg); + return ret; +} + +uint64_t +bd_get_default_extent (bd_priv_t *priv) +{ + vg_t vg = NULL; + uint64_t size = 0; + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return 0; + } + + size = lvm_vg_get_extent_size (vg); + + lvm_vg_close (vg); + + return size; +} + +/* + * Adjusts the user specified size to VG specific extent size + */ +uint64_t +bd_adjust_size (bd_priv_t *priv, uint64_t size) +{ + uint64_t extent = 0; + uint64_t nr_ex = 0; + + extent = bd_get_default_extent (priv); + if (!extent) + return 0; + + nr_ex = size / extent; + if (size % extent) + nr_ex++; + + size = extent * nr_ex; + + return size; +} + +int +bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno) +{ + vg_t vg = NULL; + lv_t lv = NULL; + int ret = -1; + + *op_errno = 0; + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + *op_errno = ENOENT; + return -1; + } + lv = lvm_lv_from_name (vg, lv_name); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name); + *op_errno = ENOENT; + goto out; + } + ret = lvm_vg_remove_lv (lv); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed", + lv_name); + *op_errno = errno; + goto out; + } +out: + lvm_vg_close (vg); + + return ret; +} + +inline void +bd_update_amtime(struct iatt *iatt, int flag) +{ + struct timespec ts = {0, }; + + clock_gettime (CLOCK_REALTIME, &ts); + if (flag & GF_SET_ATTR_ATIME) { + 
iatt->ia_atime = ts.tv_sec; + iatt->ia_atime_nsec = ts.tv_nsec; + } + if (flag & GF_SET_ATTR_MTIME) { + iatt->ia_mtime = ts.tv_sec; + iatt->ia_mtime_nsec = ts.tv_nsec; + } +} + +int +bd_snapshot_create (bd_local_t *local, bd_priv_t *priv) +{ + char *path = NULL; + bd_gfid_t dest = {0, }; + bd_gfid_t origin = {0, }; + int ret = 0; + runner_t runner = {0, }; + struct stat stat = {0, }; + + uuid_utoa_r (local->dloc->gfid, dest); + uuid_utoa_r (local->loc.gfid, origin); + + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + if (!path) { + gf_log (THIS->name, GF_LOG_WARNING, + "Insufficient memory"); + return ENOMEM; + } + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--snapshot", NULL); + runner_argprintf (&runner, "/dev/%s/%s", priv->vg, origin); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", dest); + if (strcmp (local->bdatt->type, BD_THIN)) + runner_argprintf (&runner, "-L%ldB", local->size); + runner_start (&runner); + runner_end (&runner); + + if (lstat (path, &stat) < 0) + ret = EIO; + + GF_FREE (path); + return ret; +} + +int +bd_clone (bd_local_t *local, bd_priv_t *priv) +{ + int ret = ENOMEM; + int fd1 = -1; + int fd2 = -1; + int i = 0; + char *buff = NULL; + ssize_t bytes = 0; + char *spath = NULL; + char *dpath = NULL; + struct iovec *vec = NULL; + bd_gfid_t source = {0, }; + bd_gfid_t dest = {0, }; + void *bufp[IOV_NR] = {0, }; + + vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec); + if (!vec) + return ENOMEM; + + for (i = 0; i < IOV_NR; i++) { + bufp[i] = page_aligned_alloc (IOV_SIZE, &buff); + if (!buff) + goto out; + vec[i].iov_base = buff; + vec[i].iov_len = IOV_SIZE; + } + + uuid_utoa_r (local->loc.gfid, source); + uuid_utoa_r (local->dloc->gfid, dest); + + gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source); + gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest); + if (!spath || !dpath) + goto out; + + ret = bd_create (local->dloc->gfid, local->size, 
+ local->bdatt->type, priv); + if (ret) + goto out; + + fd1 = open (spath, O_RDONLY | O_DIRECT); + if (fd1 < 0) { + ret = errno; + goto out; + } + fd2 = open (dpath, O_WRONLY | O_DIRECT); + if (fd2 < 0) { + ret = errno; + goto out; + } + + while (1) { + bytes = readv (fd1, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s", + strerror (ret)); + goto out; + } + if (!bytes) + break; + bytes = writev (fd2, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, + "write failed: %s", strerror (ret)); + goto out; + } + } + ret = 0; + +out: + for (i = 0; i < IOV_NR; i++) + GF_FREE (bufp[i]); + GF_FREE (vec); + + if (fd1 != -1) + close (fd1); + if (fd2 != -1) + close (fd2); + + FREE (spath); + FREE (dpath); + + return ret; +} + +/* + * Merges snapshot LV to origin LV and returns status + */ +int +bd_merge (bd_priv_t *priv, uuid_t gfid) +{ + bd_gfid_t dest = {0, }; + char *path = NULL; + struct stat stat = {0, }; + runner_t runner = {0, }; + int ret = 0; + + uuid_utoa_r (gfid, dest); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + + runinit (&runner); + runner_add_args (&runner, LVM_CONVERT, NULL); + runner_add_args (&runner, "--merge", NULL); + runner_argprintf (&runner, "%s", path); + runner_start (&runner); + runner_end (&runner); + + if (!lstat (path, &stat)) + ret = EIO; + + GF_FREE (path); + + return ret; +} + +int +bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict) +{ + vg_t brick = NULL; + lvm_property_value_t prop = {0, }; + lv_t lv = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + inode_t *inode = NULL; + char *origin = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + if (fd) + inode = fd->inode; + else + inode = loc->inode; + + uuid_utoa_r (inode->gfid, gfid); + lv = lvm_lv_from_name (brick, gfid); + if (!lv) { + gf_log 
(THIS->name, GF_LOG_CRITICAL, "LV %s not found", gfid); + ret = ENOENT; + goto out; + } + + prop = lvm_lv_get_property (lv, "origin"); + if (!prop.is_valid || !prop.value.string) { + ret = ENODATA; + goto out; + } + + origin = gf_strdup (prop.value.string); + ret = dict_set_dynstr (dict, BD_ORIGIN, origin); + +out: + lvm_vg_close (brick); + return ret; +} + diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c new file mode 100644 index 000000000..405474c58 --- /dev/null +++ b/xlators/storage/bd/src/bd.c @@ -0,0 +1,2404 @@ +/* + BD translator V2 - Exports Block devices on server side as regular + files to client + + Now only exporting Logical volumes supported. + + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#include <openssl/md5.h> +#include <time.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "bd-aio.h" +#include "defaults.h" +#include "glusterfs3-xdr.h" +#include "run.h" +#include "protocol-common.h" +#include "checksum.h" + +/* + * Call back function for setxattr and removexattr. + * does not do anything. FIXME: How to handle remove/setxattr failure + */ +int +bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_DESTROY (frame->root); + return 0; +} + +/* + * returns 0 if a file is mapped to BD or not. 
+ */ +int +bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid, + char **type, uint64_t *size) +{ + char *bd_xattr = NULL; + char *bd = NULL; + int ret = -1; + loc_t loc = {0, }; + dict_t *dict = NULL; + char *p = NULL; + call_frame_t *bd_frame = NULL; + + if (!xattr) + return 1; + + if (dict_get_str (xattr, BD_XATTR, &p)) + return 1; + + bd_xattr = gf_strdup (p); + + memcpy (loc.gfid, gfid, sizeof (uuid_t)); + + bd_frame = copy_frame (frame); + BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out); + + ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid); + if (ret < 0) {/* LV does not exist */ + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, &loc, + BD_XATTR, NULL); + + gf_log (this->name, GF_LOG_WARNING, + "Mapped LV not available for posix file <gfid:%s>, " + "deleting mapping", uuid_utoa (gfid)); + } else if (ret == 1) { + /* BD_XATTR size and LV size mismatch. Update BD_XATTR */ + gf_asprintf (&bd, "%s:%ld", *type, *size); + + dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (dict, ret, out); + + ret = dict_set_dynstr (dict, BD_XATTR, bd); + if (ret) + goto out; + + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0, + NULL); + } + +out: + dict_del (xattr, BD_XATTR); + GF_FREE (bd_xattr); + GF_FREE (bd); + return ret; +} + +/* + * bd_lookup_cbk: Call back from posix_lookup. 
+ */ +int32_t +bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + int ret = -1; + bd_attr_t *bdatt = NULL; + uint64_t size = 0; + char *type = BD_TYPE_NONE; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + /* iatt already cached */ + if (!bd_inode_ctx_get (inode, this, &bdatt)) + goto next; + + if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size)) + goto out; + + /* BD file, update buf */ + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_errno = ENOMEM; + goto out; + } + memcpy (&bdatt->iatt, buf, sizeof (struct iatt)); + bdatt->type = type; + + /* Cache LV size in inode_ctx */ + ret = bd_inode_ctx_set (inode, this, bdatt); + if (ret < 0) { + GF_FREE (bdatt); + op_errno = EINVAL; + goto out; + } + + bdatt->iatt.ia_size = size; + bdatt->iatt.ia_blocks = size / 512; + +next: + dict_del (xattr, GF_CONTENT_KEY); + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xattr, postparent); + return 0; +} + +/* + * bd_lookup: Issues posix_lookup to find out if file is mapped to BD + * bd_lookup -> posix_lookup -> bd_lookup_cbk +*/ +int32_t +bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + dict_t *bd_xattr = NULL; + bd_attr_t *bdatt = NULL; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) { + if (!xattr_req) { + bd_xattr = dict_new (); + BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out); + xattr_req = bd_xattr; + } + if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0) + goto out; + } + + STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this), + 
FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + + if (bd_xattr) + dict_unref (bd_xattr); + return 0; +out: + BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; +} + +int +bd_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t ctx = 0; + bd_attr_t *bdatt = NULL; + + ret = bd_inode_ctx_get (inode, this, &bdatt); + if (!ret) { + inode_ctx_del (inode, this, &ctx); + FREE (bdatt); + } + return 0; +} + +int +bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + uint64_t size = 0; + char *type = NULL; + + if (op_ret < 0) + goto out; + + list_for_each_entry (entry, &entries->list, list) { + if (entry->d_type != DT_REG) + continue; + if (!bd_get_bd_info (frame, this, entry->dict, + entry->d_stat.ia_gfid, &type, &size)) { + entry->d_stat.ia_size = size; + entry->d_stat.ia_blocks = size / 512; + FREE (type); + } + } + +out: + BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +/* + * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set + * ia_size is updated with the LV(BD_XATTR_SIZE) size + */ +int32_t +bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!dict) { + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + dict = local->dict; + } + + if (dict_set_int8 (dict, BD_XATTR, 0)) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set key %s", BD_XATTR); + goto out; + } + + STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + + 
return 0; +out: + BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict); + return 0; +} + +int +bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, bdatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) { + BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->inode = inode_ref (loc->inode); + + STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +int +bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *buff, dict_t *xdata) +{ + uint64_t size = 0; + uint64_t fr_size = 0; + bd_priv_t *priv = NULL; + vg_t vg = NULL; + + if (op_ret < 0) + goto out; + + priv = this->private; + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + op_ret = -1; + op_errno = EAGAIN; + goto out; + } + size = lvm_vg_get_size (vg); + fr_size = 
lvm_vg_get_free_size (vg); + lvm_vg_close (vg); + + buff->f_blocks += size / buff->f_frsize; + buff->f_bfree += fr_size / buff->f_frsize; + buff->f_bavail += fr_size / buff->f_frsize; + +out: + BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata); + return 0; +} + +/* + * bd_statfs: Mimics statfs by returning used/free extents in the VG + */ +int +bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL); + return 0; +} + +int +bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + bd_local_t *local = frame->local; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + /* if its already cached return it */ + if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) { + BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + + STACK_WIND (frame, 
bd_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +/* + * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD + * file + */ +int +bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = -1; + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + struct iovec vec = {0, }; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + uint64_t bd_size = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + } + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto out; + } + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + _fd = bd_fd->fd; + op_ret = pread (_fd, iobuf->ptr, size, offset); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "read failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + + vec.iov_base = iobuf->ptr; + vec.iov_len = op_ret; + + iobref = iobref_new (); + iobref_add (iobref, iobuf); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_errno = EINVAL; + op_ret = -1; + goto out; + } + bd_size = bdatt->iatt.ia_size; + if (!bd_size || (offset + vec.iov_len) >= bd_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME); + +out: + BD_STACK_UNWIND (readv, frame, op_ret, op_errno, + &vec, 1, &bdatt->iatt, iobref, NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) 
+ iobuf_unref (iobuf); + + return 0; +} + +#ifdef BLKDISCARD +/* + * bd_discard: Sends BLKDISCARD ioctl to the block device + */ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + int op_errno = EINVAL; + bd_fd_t *bd_fd = NULL; + uint64_t param[2] = {0, }; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + /* posix */ + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; + } + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + op_errno = EINVAL; + goto out; + } + + param[0] = offset; + param[1] = len; + ret = ioctl (bd_fd->fd, BLKDISCARD, param); + if (ret < 0) { + if (errno == ENOTTY) + op_errno = ENOSYS; + else + op_errno = errno; + goto out; + } + memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + + BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf, + &bdatt->iatt, xdata); + return 0; + +out: + BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} +#else + +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL); + return 0; +} +#endif + +/* + * Call back from posix_open for opening the backing posix file + * If it failed, close BD fd + */ +int +bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + + if (!op_ret) + goto out; + + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) /* posix file */ + goto out; + + /* posix open failed */ + if (bd_fd_ctx_get (this, 
fd, &bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bd_fd is NULL from fd=%p", fd); + goto out; + } + close (bd_fd->fd); + GF_FREE (bd_fd); + +out: + BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL); + + return 0; +} + +/* + * bd_open: Opens BD file if given posix file is mapped to BD. Also opens + * posix file. + * fd contains both posix and BD fd + */ +int32_t +bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = EINVAL; + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + bd_gfid_t gfid = {0, }; + char *devpath = NULL; + bd_priv_t *priv = this->private; + int _fd = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + goto posix; + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + BD_VALIDATE_MEM_ALLOC (devpath, ret, out); + + _fd = open (devpath, flags | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out); + + bd_fd->fd = _fd; + bd_fd->flag = flags | O_LARGEFILE; + + if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + ret = 0; + +posix: + + /* open posix equivalant of this file, fd needed for fd related + operations like fsetxattr, ftruncate etc */ + STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL); + + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bd_fd); + } + + return 0; +} + +/* + 
* call back from posix_setattr after updating iatt to posix file. + */ +int +bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = local->bdatt; + + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_do_fsync (int fd, int datasync) +{ + int op_errno = 0; + +#ifdef HAVE_FDATASYNC + if (datasync) { + if (fdatasync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fdatasync on fd=%d failed: %s", + fd, strerror (errno)); + } + + } else +#endif + { + if (fsync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fsync on fd=%d failed: %s", + fd, strerror (op_errno)); + } + } + + return op_errno; +} + +/* + * bd_fsync: Syncs if BD fd, forwards the request to posix + * fsync -> posix_setattr -> posix_fsync +*/ +int32_t +bd_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t datasync, dict_t *xdata) +{ + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, fd, datasync, + xdata); + return 0; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + + op_errno = bd_do_fsync (bd_fd->fd, datasync); + if (op_errno) + goto out; + + /* For BD, Update the a|mtime during full fsync only */ + if (!datasync) { + local = bd_local_init 
(frame, this); + /* In case of mem failure, should posix flush called ? */ + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + local->bdatt->type = gf_strdup (bdatt->type); + memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&local->bdatt->iatt, valid); + uuid_copy (local->loc.gfid, fd->inode->gfid); + STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &local->loc, + &local->bdatt->iatt, + valid, NULL); + return 0; + } + +out: + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + bd_local_t *local = NULL; + int op_errno = EINVAL; + loc_t loc = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) + goto out; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "bdfd/bdatt is NULL from fd=%p", fd); + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->fd = fd_ref (fd); + uuid_copy (loc.gfid, bdatt->iatt.ia_gfid); + + /* Update the a|mtime during flush */ + STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD 
(this)->fops->setattr, &loc, &bdatt->iatt, + valid, NULL); + + return 0; + +out: + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + + return 0; +} + +int32_t +bd_release (xlator_t *this, fd_t *fd) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + uint64_t tmp_bfd = 0; + bd_attr_t *bdatt = NULL; + bd_priv_t *priv = this->private; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (ret || !bdatt) /* posix file */ + goto out; + + /* FIXME: Update amtime during release */ + + ret = fd_ctx_del (fd, this, &tmp_bfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bfd is NULL from fd=%p", fd); + goto out; + } + bd_fd = (bd_fd_t *)(long)tmp_bfd; + + close (bd_fd->fd); + GF_FREE (bd_fd); +out: + return 0; +} + +/* + * Call back for removexattr after removing BD_XATTR incase of + * bd create failure + */ +int +bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + return 0; + +} + +/* + * Call back after setting BD_XATTR. Creates BD. 
If BD creation is a failure + * invokes posix_removexattr to remove created BD_XATTR + */ +int +bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto next; + + /* Create LV */ + op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size, + local->bdatt->type, this->private); + if (!op_errno) + goto out; + + /* LV creation failed, remove BD_XATTR */ + if (local->fd) + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + local->fd, BD_XATTR, NULL); + else + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_ret = -1; + op_errno = ENOMEM; + goto next; + } + + memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt)); + bdatt->type = gf_strdup (local->bdatt->type); + + bd_inode_ctx_set (local->inode, THIS, bdatt); + +next: + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + return 0; + +} + +/* + * Call back from posix_stat + */ +int +bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, + dict_t *xdata) +{ + char *param = NULL; + char *type = NULL; + char *s_size = NULL; + char *p = NULL; + char *copy = NULL; + bd_local_t *local = frame->local; + bd_priv_t *priv = this->private; + char *bd = NULL; + uint64_t size = 0; + + if (op_ret < 0) + goto out; + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EOPNOTSUPP; + goto out; + } + + param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + type = 
strtok_r (param, ":", &p); + if (!type) { + op_errno = EINVAL; + goto out; + } + + if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given", + type); + op_errno = EINVAL; + goto out; + } + + s_size = strtok_r (NULL, ":", &p); + + /* If size not specified get default size */ + if (!s_size) + size = bd_get_default_extent (priv); + else + gf_string2bytesize (s_size, &size); + + gf_asprintf (&bd, "%s:%ld", type, size); + BD_VALIDATE_MEM_ALLOC (bd, op_errno, out); + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) { + op_errno = EINVAL; + goto out; + } + + local->bdatt->type = gf_strdup (type); + memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt)); + local->bdatt->iatt.ia_size = size; + + if (local->fd) + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + + GF_FREE (bd); + GF_FREE (copy); + return 0; +} + +int +bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL); + + return 0; +} + +int +bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (op_ret < 0) + goto out; + + if 
(local->offload == BD_OF_SNAPSHOT) + op_ret = bd_snapshot_create (frame->local, this->private); + else + op_ret = bd_clone (frame->local, this->private); + + if (op_ret) { + STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + local->dloc, BD_XATTR, NULL); + return 0; + } + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, op_errno, op_errno, NULL); + + return 0; +} + +int +bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + char *type = NULL; + char *p = NULL; + + if (op_ret < 0) + goto out; + + if (dict_get_str (xattr, BD_XATTR, &p)) { + op_errno = EINVAL; + goto out; + } + + type = gf_strdup (p); + BD_VALIDATE_MEM_ALLOC (type, op_errno, out); + + p = strrchr (type, ':'); + if (!p) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, + "source file xattr %s corrupted?", type); + goto out; + } + + *p='\0'; + + /* For clone size is taken from source LV */ + if (!local->size) { + p++; + gf_string2bytesize (p, &local->size); + } + gf_asprintf (&bd, "%s:%ld", type, local->size); + local->bdatt->type = gf_strdup (type); + dict_del (local->dict, BD_XATTR); + dict_del (local->dict, LINKTO); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + local->dloc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (type); + GF_FREE (bd); + + return 0; +} + +int +bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *iatt, + dict_t *xattr, struct 
iatt *postparent) +{ + bd_local_t *local = frame->local; + char *bd = NULL; + int ret = -1; + char *linkto = NULL; + + if (op_ret < 0 && op_errno != ENODATA) { + op_errno = EINVAL; + goto out; + } + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a " + "regular file"); + goto out; + } + + ret = dict_get_str (xattr, LINKTO, &linkto); + if (linkto) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination file not " + "present in same brick"); + goto out; + } + + ret = dict_get_str (xattr, BD_XATTR, &bd); + if (bd) { + op_errno = EEXIST; + goto out; + } + + local->bdatt = CALLOC (1, sizeof (bd_attr_t)); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int +bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + /* FIXME: if delete failed, remove xattr */ + + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +bd_do_merge(call_frame_t *frame, xlator_t *this) +{ + bd_local_t *local = frame->local; + inode_t *parent = NULL; + char *p = NULL; + int op_errno = 0; + + op_errno = bd_merge (this->private, local->inode->gfid); + if (op_errno) + goto out; + + /* + * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does + * not have loc->pargfid set. Get parent's gfid by getting parents inode + */ + parent = inode_parent (local->inode, NULL, NULL); + if (!parent) { + /* + * FIXME: Snapshot LV already deleted. 
+ * remove xattr, instead of returning failure + */ + op_errno = EINVAL; + goto out; + } + uuid_copy (local->loc.pargfid, parent->gfid); + + p = strrchr (local->loc.path, '/'); + if (p) + p++; + local->loc.name = p; + + STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + + return op_errno; +} + +int +bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, bd_offload_t offload) +{ + char *param = NULL; + char *param_copy = NULL; + char *p = NULL; + char *size = NULL; + char *gfid = NULL; + int op_errno = 0; + bd_local_t *local = frame->local; + + param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + param_copy = param; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->dloc = CALLOC (1, sizeof (loc_t)); + BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + gfid = strtok_r (param, ":", &p); + size = strtok_r (NULL, ":", &p); + if (size) + gf_string2bytesize (size, &local->size); + else if (offload != BD_OF_CLONE) + local->size = bd_get_default_extent (this->private); + + if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) { + op_errno = EINVAL; + goto out; + } + if (dict_set_int8 (local->dict, LINKTO, 1) < 0) { + op_errno = EINVAL; + goto out; + } + + uuid_parse (gfid, local->dloc->gfid); + local->offload = offload; + + STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, local->dloc, + local->dict); + + return 0; + +out: + if (fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (param_copy); + return 0; +} + +/* + * bd_setxattr: Used to create & map an LV to a posix file using + * BD_XATTR xattr + * 
bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + */ +int +bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) + cl_type = BD_OF_MERGE; + + bd_inode_ctx_get (loc->inode, this, &bdatt); + if (!cl_type && !data) { + STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, loc, dict, + flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->data = data; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s not mapped to BD", loc->path); + op_errno = EINVAL; + goto out; + } + if (cl_type == BD_OF_MERGE) + bd_do_merge (frame, this); + else + bd_offload (frame, this, loc, NULL, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s already mapped to BD", loc->path); + op_errno = EEXIST; + goto out; + } + STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, loc, xdata); + } + + return 0; +out: + if (op_errno) + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); + + return 0; +} + +/* + * bd_fsetxattr: Used to create/map an LV to a posix file using + * BD_XATTR xattr + * bd_fsetxattr -> 
posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + * -> bd_fsetxattr_cbk + */ +int32_t +bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + data = dict_get (dict, BD_XATTR); + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) { + /* + * bd_merge is not supported for fsetxattr, because snapshot LV + * is opened and it causes problem in snapshot merge + */ + op_errno = EOPNOTSUPP; + goto out; + } + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + if (!cl_type && !data) { + /* non bd file object */ + STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + local->data = data; + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p not mapped to BD", fd); + op_errno = EINVAL; + goto out; + + } + bd_offload (frame, this, NULL, fd, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p already mapped to BD", fd); + op_errno = EEXIST; + goto out; + } + STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->fstat, fd, xdata); + } + + return 0; +out: + + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +bd_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +out: + BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int32_t +bd_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + + return 0; +out: + BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int +bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * Call back for setxattr after setting BD_XATTR_SIZE. 
+ */ +int +bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + char *bd = NULL; + + if (op_ret < 0) + goto out; + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) + goto revert_xattr; + + op_errno = bd_resize (this->private, local->inode->gfid, + local->bdatt->iatt.ia_size); + if (op_errno) + goto revert_xattr; + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + /* LV resized, update new size in the cache */ + bdatt->iatt.ia_size = local->bdatt->iatt.ia_size; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + else + BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + + return 0; + +revert_xattr: + /* revert setxattr */ + op_ret = dict_get_str (local->dict, BD_XATTR, &bd); + GF_FREE (bd); + gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size); + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * call back from posix_[f]truncate_stat + * If offset > LV size, it resizes the LV and calls posix_setxattr + * to update new LV size in xattr else calls posix_setattr for updating + * the posix file so that truncate fop behaves properly + */ +int +bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if 
(op_ret < 0) + goto out; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) { + op_errno = EINVAL; + goto out; + } + + gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, + NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, + NULL); + GF_FREE (bd); + return 0; +} + +void +bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, + off_t offset, bd_attr_t *bdatt) +{ + bd_local_t *local = NULL; + struct iatt prebuf = {0, }; + int op_errno = 0; + int op_ret = -1; + + /* If requested size is less than LV size, return success */ + if (offset <= bdatt->iatt.ia_size) { + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + op_ret = 0; + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (fd) { + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + } else { + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); + } + + local->bdatt->iatt.ia_size = + bd_adjust_size (this->private, offset); + + STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, NULL); + + return; + +out: + if (fd) + BD_STACK_UNWIND (ftruncate, frame, op_ret, 
op_errno, + &prebuf, &bdatt->iatt, NULL); + else + BD_STACK_UNWIND (truncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + return; +} + +/* + * bd_ftruncate: Resizes a LV if fd belongs to BD. + */ +int32_t +bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, fd, NULL, offset, bdatt); + return 0; +out: + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* + * bd_truncate: Resizes a LV if file maps to LV. + */ +int32_t +bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, NULL, loc, offset, bdatt); + return 0; + +out: + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, + uint64_t bd_size) +{ + int index = 0; + int retval = 0; + off_t internal_offset = 0; + + if (!vector) + return -EFAULT; + + retval = pwritev (fd, vector, count, offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + retval = -errno; + goto err; + } +/* + + + internal_offset = 
offset; + for (index = 0; index < count; index++) { + if (internal_offset > bd_size) { + op_ret = -ENOSPC; + goto err; + } + if (internal_offset + vector[index].iov_len > bd_size) { + vector[index].iov_len = bd_size - internal_offset; + no_space = 1; + } + retval = pwritev (fd, vector[index].iov_base, + vector[index].iov_len, internal_offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + op_ret = -errno; + goto err; + } + op_ret += retval; + internal_offset += retval; + if (no_space) + break; + } +*/ +err: + return retval; +} + +/* + * bd_writev: Writes to LV if its BD file or forwards the request to posix_write + * bd_writev -> posix_writev -> bd_writev_cbk + */ +int +bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdict) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + uint64_t size = 0; + struct iatt prebuf = {0, }; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (vector, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { /* posix fd */ + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdict); + return 0; + } + + _fd = bd_fd->fd; + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + size = bdatt->iatt.ia_size; + + op_ret = __bd_pwritev (_fd, vector, count, offset, size); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 + ", %s", offset, strerror (op_errno)); + goto out; + } + + memcpy (&prebuf, &bdatt->iatt, 
sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +out: + + BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + int *valid = cookie; + bd_local_t *local = frame->local; + + if (op_ret < 0 || !valid || !local) + goto out; + + if (bd_inode_ctx_get (local->inode, this, &bdatt)) + goto out; + + if (*valid & GF_SET_ATTR_UID) + bdatt->iatt.ia_uid = postbuf->ia_uid; + else if (*valid & GF_SET_ATTR_GID) + bdatt->iatt.ia_gid = postbuf->ia_gid; + else if (*valid & GF_SET_ATTR_MODE) { + bdatt->iatt.ia_type = postbuf->ia_type; + bdatt->iatt.ia_prot = postbuf->ia_prot; + } else if (*valid & GF_SET_ATTR_ATIME) { + bdatt->iatt.ia_atime = postbuf->ia_atime; + bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec; + } else if (*valid & GF_SET_ATTR_MTIME) { + bdatt->iatt.ia_mtime = postbuf->ia_mtime; + bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec; + } + + bdatt->iatt.ia_ctime = postbuf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec; + + memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); +out: + FREE (valid); + BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +int +bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + int *ck_valid = NULL; + int op_errno = 0; + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + ck_valid = CALLOC (1, sizeof (valid)); + BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); + + 
local->inode = inode_ref (loc->inode); + *ck_valid = valid; + + STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; +out: + BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata); + return 0; +} + +int +bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + if (bd_inode_ctx_get (inode, this, &bdatt)) + goto out; + + bdatt->iatt.ia_ctime = buf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec; + bdatt->iatt.ia_nlink = buf->ia_nlink; + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, NULL); + return 0; +} + +int +bd_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int +bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, const char *name, dict_t *xdata) +{ + dict_t *xattr = NULL; + int op_ret = -1; + int op_errno = ENOMEM;; + bd_priv_t *priv = this->private; + + xattr = dict_new (); + if (!xattr) + goto out; + + if (!strcmp (name, VOL_TYPE)) + op_ret = dict_set_int64 (xattr, (char *)name, 1); + else if (!strcmp (name, VOL_CAPS)) + op_ret = dict_set_int64 (xattr, (char *)name, priv->caps); + else + op_ret = bd_get_origin (this->private, loc, fd, xattr); + +out: + if (loc) + BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, + xdata); + else + BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + op_ret = dict_reset (xattr); + dict_unref (xattr); + + return 0; +} + +int +bd_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t 
*fd, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata); + else + STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + +int +bd_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata); + else + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + + return 0; +} + +int +bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + bd_gfid_t gfid = {0, }; + bd_local_t *local = frame->local; + + if (buf->ia_nlink > 1) + goto posix; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + uuid_utoa_r (inode->gfid, gfid); + if (bd_delete_lv (this->private, gfid, &op_errno) < 0) { + if (op_errno != ENOENT) + goto out; + } + +posix: + /* remove posix */ + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +bd_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; + } + + local = bd_local_init 
(frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, NULL); + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +bd_priv (xlator_t *this) +{ + return 0; +} + +int32_t +bd_inode (xlator_t *this) +{ + return 0; +} + +int32_t +bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int _fd = -1; + char *alloc_buf = NULL; + char *buf = NULL; + int32_t weak_checksum = 0; + bd_fd_t *bd_fd = NULL; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rchecksum, fd, offset, + len, xdata); + return 0; + } + + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); + + alloc_buf = page_aligned_alloc (len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; + goto out; + } + + _fd = bd_fd->fd; + + LOCK (&fd->lock); + { + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + op_errno = errno; + } + } + UNLOCK (&fd->lock); + + if (ret < 0) + goto out; + + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, + (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, + (unsigned char *) strong_checksum); + + op_ret = 0; +out: + BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, + weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + 
int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that bd xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + break; + } + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); + + if (ret != 0) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + bd_priv_t *priv = this->private; + + GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options, + bool, out); + + if (priv->aio_configured) + bd_aio_on (this); + else + bd_aio_off (this); + + ret = 0; +out: + return ret; +} + +/** + * bd xlator init - Validate configured VG + */ +int +init (xlator_t *this) +{ + int ret = 0; + char *vg_data = NULL; + char *device = NULL; + bd_priv_t *_private = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: storage/bd needs posix as subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling. 
Please check the volume file."); + } + + GF_OPTION_INIT ("export", vg_data, str, error); + GF_OPTION_INIT ("device", device, str, error); + + /* Now we support only LV device */ + if (strcasecmp (device, BACKEND_VG)) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: unknown %s backend %s", BD_XLATOR, device); + return -1; + } + + this->local_pool = mem_pool_new (bd_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: Failed to create bd memory pool"); + return -1; + } + + ret = 0; + _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private); + if (!_private) + goto error; + + this->private = _private; + _private->vg = gf_strdup (vg_data); + if (!_private->vg) + goto error; + + _private->handle = lvm_init (NULL); + if (!_private->handle) { + gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed"); + goto error; + } + _private->caps = BD_CAPS_BD; + if (bd_scan_vg (this, _private)) + goto error; + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error); + if (_private->aio_configured) { + if (bd_aio_on (this)) { + gf_log (this->name, GF_LOG_ERROR, + "BD AIO init failed"); + ret = -1; + goto error; + } + } + + _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT; + + return 0; +error: + GF_FREE (_private->vg); + if (_private->handle) + lvm_quit (_private->handle); + mem_pool_destroy (this->local_pool); + GF_FREE (_private); + return -1; +} + +void +fini (xlator_t *this) +{ + bd_priv_t *priv = this->private; + mem_pool_destroy (this->local_pool); + this->local_pool = NULL; + if (!priv) + return; + lvm_quit (priv->handle); + GF_FREE (priv->vg); + this->private = NULL; + GF_FREE (priv); + return; +} + +struct xlator_dumpops dumpops = { + .priv = bd_priv, + .inode = bd_inode, +}; + +struct xlator_fops fops = { + .readdirp = bd_readdirp, + .lookup = bd_lookup, + .stat = bd_stat, + .statfs = bd_statfs, + .open = bd_open, + .fstat = 
bd_fstat, + .rchecksum = bd_rchecksum, + .readv = bd_readv, + .fsync = bd_fsync, + .setxattr = bd_setxattr, + .fsetxattr = bd_fsetxattr, + .removexattr = bd_removexattr, + .fremovexattr=bd_fremovexattr, + .truncate = bd_truncate, + .ftruncate = bd_ftruncate, + .writev = bd_writev, + .getxattr = bd_getxattr, + .fgetxattr = bd_fgetxattr, + .unlink = bd_unlink, + .link = bd_link, + .flush = bd_flush, + .setattr = bd_setattr, + .discard = bd_discard, +}; + +struct xlator_cbks cbks = { + .release = bd_release, + .forget = bd_forget, +}; + +struct volume_options options[] = { + { .key = {"export"}, + .type = GF_OPTION_TYPE_STR}, + { .key = {"device"}, + .type = GF_OPTION_TYPE_STR, + .default_value = BACKEND_VG}, + { + .key = {"bd-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + + { .key = {NULL} } +}; diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h new file mode 100644 index 000000000..34b4c9e22 --- /dev/null +++ b/xlators/storage/bd/src/bd.h @@ -0,0 +1,178 @@ +/* + BD translator - Exports Block devices on server side as regular + files to client + + Copyright IBM, Corp. 2012 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _BD_H +#define _BD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "xlator.h" +#include "mem-types.h" + +#define BD_XLATOR "block device mapper xlator" +#define BACKEND_VG "vg" +#define GF_XATTR "user.glusterfs" +#define BD_XATTR GF_XATTR ".bd" + +#define BD_LV "lv" +#define BD_THIN "thin" + +#define LVM_RESIZE "/sbin/lvresize" +#define LVM_CREATE "/sbin/lvcreate" +#define LVM_CONVERT "/sbin/lvconvert" + +#define VOL_TYPE "volume.type" +#define VOL_CAPS "volume.caps" + +#define ALIGN_SIZE 4096 + +#define BD_CAPS_BD 0x01 +#define BD_CAPS_THIN 0x02 +#define BD_CAPS_OFFLOAD_COPY 0x04 +#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08 + +#define BD_CLONE "clone" +#define BD_SNAPSHOT "snapshot" +#define BD_MERGE "merge" +#define BD_ORIGIN "list-origin" + +#define IOV_NR 4 +#define IOV_SIZE (64 * 1024) + +#define ALIGN_SIZE 4096 + +#define LINKTO "trusted.glusterfs.dht.linkto" + +#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \ + if (!buff) { \ + op_errno = ENOMEM; \ + gf_log (this->name, GF_LOG_ERROR, "out of memory"); \ + goto label; \ + } + +#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \ + if (!local) { \ + op_errno = EINVAL; \ + goto label; \ + } + +#define BD_STACK_UNWIND(typ, frame, args ...) 
do { \ + bd_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + bd_local_free (__this, __local); \ + } while (0) + +typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; + +enum gf_bd_mem_types_ { + gf_bd_private = gf_common_mt_end + 1, + gf_bd_attr, + gf_bd_fd, + gf_bd_mt_end +}; + +/** + * bd_fd - internal structure + */ +typedef struct bd_fd { + int fd; + int32_t flag; + int odirect; +} bd_fd_t; + +typedef struct bd_priv { + lvm_t handle; + char *vg; + char *pool; + int caps; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; + gf_boolean_t aio_configured; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif +} bd_priv_t; + + +typedef enum bd_type { + BD_TYPE_NONE, + BD_TYPE_LV, +} bd_type_t; + +typedef struct { + struct iatt iatt; + char *type; +} bd_attr_t; + +typedef enum { + BD_OF_NONE, + BD_OF_CLONE, + BD_OF_SNAPSHOT, + BD_OF_MERGE, +} bd_offload_t; + +typedef struct { + dict_t *dict; + bd_attr_t *bdatt; + inode_t *inode; + loc_t loc; + fd_t *fd; + data_t *data; /* for setxattr */ + bd_offload_t offload; + uint64_t size; + loc_t *dloc; +} bd_local_t; + +/* Prototypes */ +int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx); +int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx); +int bd_scan_vg (xlator_t *this, bd_priv_t *priv); +bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this); +void bd_local_free (xlator_t *this, bd_local_t *local); +int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd); +char *page_aligned_alloc (size_t size, char **aligned_buf); +int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid); +uint64_t bd_get_default_extent (bd_priv_t *priv); +uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size); +int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv); +int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t 
size); +int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); + +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +inline void bd_update_amtime(struct iatt *iatt, int flag); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); + +#endif diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am deleted file mode 100644 index 7e2376979..000000000 --- a/xlators/storage/bdb/src/Makefile.am +++ /dev/null @@ -1,18 +0,0 @@ - -xlator_LTLIBRARIES = bdb.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/storage - -bdb_la_LDFLAGS = -module -avoidversion - -bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c -bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = bdb.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -AM_LDFLAGS = -ldb - -CLEANFILES = - diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c deleted file mode 100644 index 61560edfa..000000000 --- a/xlators/storage/bdb/src/bctx.c +++ /dev/null @@ -1,341 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <list.h> -#include <bdb.h> -#include <libgen.h> /* for dirname */ - -static void -__destroy_bctx (bctx_t *bctx) -{ - if (bctx->directory) - GF_FREE (bctx->directory); - - if (bctx->db_path) - GF_FREE (bctx->db_path); - - GF_FREE (bctx); -} - -static void -__unhash_bctx (bctx_t *bctx) -{ - list_del_init (&bctx->b_hash); -} - -static int32_t -bctx_table_prune (bctx_table_t *table) -{ - int32_t ret = 0; - struct list_head purge = {0,}; - struct list_head *next = NULL; - bctx_t *entry = NULL; - bctx_t *del = NULL, *tmp = NULL; - - if (!table) - return 0; - - INIT_LIST_HEAD (&purge); - - LOCK (&table->lock); - { - if ((table->lru_limit) && - (table->lru_size > table->lru_limit)) { - while (table->lru_size > table->lru_limit) { - next = table->b_lru.next; - entry = list_entry (next, bctx_t, list); - - list_move_tail (next, &table->purge); - __unhash_bctx (entry); - - table->lru_size--; - ret++; - } - } - list_move_tail (&purge, &table->purge); - list_del_init (&table->purge); - } - UNLOCK (&table->lock); - - list_for_each_entry_safe (del, tmp, &purge, list) { - list_del_init (&del->list); - if (del->primary) { - ret = del->primary->close (del->primary, 0); - if (ret != 0) { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s: %s " - "(failed to close primary database)", - del->directory, db_strerror (ret)); - } else { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s (lru=%d)" - "(closed primary database)", - del->directory, table->lru_size); - } - } - if (del->secondary) { - ret = del->secondary->close (del->secondary, 0); - if (ret != 0) { - 
gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s: %s " - "(failed to close secondary database)", - del->directory, db_strerror (ret)); - } else { - gf_log (table->this->name, GF_LOG_DEBUG, - "_BCTX_TABLE_PRUNE %s (lru=%d)" - "(closed secondary database)", - del->directory, table->lru_size); - } - } - __destroy_bctx (del); - } - - return ret; -} - - -/* struct bdb_ctx related */ -static inline uint32_t -bdb_key_hash (char *key, uint32_t hash_size) -{ - uint32_t hash = 0; - - hash = *key; - - if (hash) { - for (key += 1; *key != '\0'; key++) { - hash = (hash << 5) - hash + *key; - } - } - - return (hash + *key) % hash_size; -} - -static void -__hash_bctx (bctx_t *bctx) -{ - bctx_table_t *table = NULL; - char *key = NULL; - - table = bctx->table; - - MAKE_KEY_FROM_PATH (key, bctx->directory); - bctx->key_hash = bdb_key_hash (key, table->hash_size); - - list_del_init (&bctx->b_hash); - list_add (&bctx->b_hash, &table->b_hash[bctx->key_hash]); -} - -static inline bctx_t * -__bctx_passivate (bctx_t *bctx) -{ - if (bctx->primary) { - list_move_tail (&bctx->list, &(bctx->table->b_lru)); - bctx->table->lru_size++; - } else { - list_move_tail (&bctx->list, &bctx->table->purge); - __unhash_bctx (bctx); - } - return bctx; -} - -static inline bctx_t * -__bctx_activate (bctx_t *bctx) -{ - list_move (&bctx->list, &bctx->table->active); - bctx->table->lru_size--; - - return bctx; -} - -static bctx_t * -__bdb_ctx_unref (bctx_t *bctx) -{ - GF_ASSERT (bctx->ref); - - --bctx->ref; - - if (!bctx->ref) - bctx = __bctx_passivate (bctx); - - return bctx; -} - - -bctx_t * -bctx_unref (bctx_t *bctx) -{ - bctx_table_t *table = NULL; - - if (!bctx && !bctx->table) - return NULL; - - table = bctx->table; - - LOCK (&table->lock); - { - bctx = __bdb_ctx_unref (bctx); - } - UNLOCK (&table->lock); - - bctx_table_prune (table); - - return bctx; -} - -/* - * NOTE: __bdb_ctx_ref() is called only after holding table->lock and - * bctx->lock, in that order - */ -static inline bctx_t * 
-__bctx_ref (bctx_t *bctx) -{ - if (!bctx->ref) - __bctx_activate (bctx); - - bctx->ref++; - - return bctx; -} - -bctx_t * -bctx_ref (bctx_t *bctx) -{ - LOCK (&(bctx->table->lock)); - { - __bctx_ref (bctx); - } - UNLOCK (&(bctx->table->lock)); - - return bctx; -} - - -#define BDB_THIS(table) (table->this) - -static inline bctx_t * -__create_bctx (bctx_table_t *table, - const char *path) -{ - bctx_t *bctx = NULL; - char *db_path = NULL; - - bctx = GF_CALLOC (1, sizeof (*bctx), gf_bdb_mt_bctx_t); - GF_VALIDATE_OR_GOTO ("bctx", bctx, out); - - bctx->table = table; - bctx->directory = gf_strdup (path); - GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path); - - bctx->db_path = gf_strdup (db_path); - GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); - - INIT_LIST_HEAD (&bctx->c_list); - INIT_LIST_HEAD (&bctx->list); - INIT_LIST_HEAD (&bctx->b_hash); - - LOCK_INIT (&bctx->lock); - - __hash_bctx (bctx); - - list_add (&bctx->list, &table->b_lru); - table->lru_size++; - -out: - return bctx; -} - -/* bctx_lookup - lookup bctx_t for the directory @directory. - * (see description of bctx_t in bdb.h) - * - * @table: bctx_table_t for this instance of bdb. - * @directory: directory for which bctx_t is being looked up. 
- */ -bctx_t * -bctx_lookup (bctx_table_t *table, - const char *directory) -{ - char *key = NULL; - uint32_t key_hash = 0; - bctx_t *trav = NULL, *bctx = NULL, *tmp = NULL; - int32_t need_break = 0; - - GF_VALIDATE_OR_GOTO ("bctx", table, out); - GF_VALIDATE_OR_GOTO ("bctx", directory, out); - - MAKE_KEY_FROM_PATH (key, directory); - key_hash = bdb_key_hash (key, table->hash_size); - - LOCK (&table->lock); - { - if (list_empty (&table->b_hash[key_hash])) { - goto creat_bctx; - } - - list_for_each_entry_safe (trav, tmp, &table->b_hash[key_hash], - b_hash) { - LOCK(&trav->lock); - { - if (!strcmp(trav->directory, directory)) { - bctx = __bctx_ref (trav); - need_break = 1; - } - } - UNLOCK(&trav->lock); - - if (need_break) - break; - } - - creat_bctx: - if (!bctx) { - bctx = __create_bctx (table, directory); - bctx = __bctx_ref (bctx); - } - } - UNLOCK (&table->lock); -out: - return bctx; -} - - -bctx_t * -bctx_parent (bctx_table_t *table, - const char *path) -{ - char *pathname = NULL, *directory = NULL; - bctx_t *bctx = NULL; - - GF_VALIDATE_OR_GOTO ("bctx", table, out); - GF_VALIDATE_OR_GOTO ("bctx", path, out); - - pathname = gf_strdup (path); - GF_VALIDATE_OR_GOTO ("bctx", pathname, out); - directory = dirname (pathname); - - bctx = bctx_lookup (table, directory); - GF_VALIDATE_OR_GOTO ("bctx", bctx, out); - -out: - if (pathname) - free (pathname); - return bctx; -} diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c deleted file mode 100644 index f70ec47f4..000000000 --- a/xlators/storage/bdb/src/bdb-ll.c +++ /dev/null @@ -1,1464 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include "bdb.h" -#include <list.h> -#include "hashfn.h" -/* - * implement the procedures to interact with bdb */ - -/**************************************************************** - * - * General wrappers and utility procedures for bdb xlator - * - ****************************************************************/ - -ino_t -bdb_inode_transform (ino_t parent, - const char *name, - size_t namelen) -{ - ino_t ino = -1; - uint64_t hash = 0; - - hash = gf_dm_hashfn (name, namelen); - - ino = (((parent << 32) | 0x00000000ffffffffULL) - & (hash | 0xffffffff00000000ULL)); - - return ino; -} - -static int -bdb_generate_secondary_hash (DB *secondary, - const DBT *pkey, - const DBT *data, - DBT *skey) -{ - char *primary = NULL; - uint32_t *hash = NULL; - - primary = pkey->data; - - hash = GF_CALLOC (1, sizeof (uint32_t), gf_bdb_mt_uint32_t); - - *hash = gf_dm_hashfn (primary, pkey->size); - - skey->data = hash; - skey->size = sizeof (hash); - skey->flags = DB_DBT_APPMALLOC; - - return 0; -} - -/*********************************************************** - * - * bdb storage database utilities - * - **********************************************************/ - -/* - * bdb_db_open - opens a storage db. - * - * @ctx: context specific to the directory for which we are supposed to open db - * - * see, if we have empty slots to open a db. 
- * if (no-empty-slots), then prune open dbs and close as many as possible - * if (empty-slot-available), tika muchkonDu db open maaDu - * - */ -static int -bdb_db_open (bctx_t *bctx) -{ - DB *primary = NULL; - DB *secondary = NULL; - int32_t ret = -1; - bctx_table_t *table = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - - table = bctx->table; - GF_VALIDATE_OR_GOTO ("bdb-ll", table, out); - - /* we have to do the following, we can't deny someone of db_open ;) */ - ret = db_create (&primary, table->dbenv, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to create database object" - " for primary database)", - bctx->directory, db_strerror (ret)); - ret = -ENOMEM; - goto out; - } - - if (table->page_size) { - ret = primary->set_pagesize (primary, - table->page_size); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to set page-size " - "to %"PRIu64")", - bctx->directory, db_strerror (ret), - table->page_size); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: page-size set to %"PRIu64, - bctx->directory, table->page_size); - } - } - - ret = primary->open (primary, NULL, bctx->db_path, "primary", - table->access_mode, table->dbflags, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to open primary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - } - - ret = db_create (&secondary, table->dbenv, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_OPEN %s: %s (failed to create database object" - " for secondary database)", - bctx->directory, db_strerror (ret)); - ret = -ENOMEM; - goto cleanup; - } - - ret = secondary->open (secondary, NULL, bctx->db_path, "secondary", - table->access_mode, table->dbflags, 0); - if (ret != 0 ) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to open secondary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - 
} - - ret = primary->associate (primary, NULL, secondary, - bdb_generate_secondary_hash, -#ifdef DB_IMMUTABLE_KEY - DB_IMMUTABLE_KEY); -#else - 0); -#endif - if (ret != 0 ) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_DB_OPEN %s: %s " - "(failed to associate primary database with " - "secondary database)", - bctx->directory, db_strerror (ret)); - ret = -1; - goto cleanup; - } - -out: - bctx->primary = primary; - bctx->secondary = secondary; - - return ret; -cleanup: - if (primary) - primary->close (primary, 0); - if (secondary) - secondary->close (secondary, 0); - - return ret; -} - -int32_t -bdb_cursor_close (bctx_t *bctx, - DBC *cursorp) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); - - LOCK (&bctx->lock); - { -#ifdef HAVE_BDB_CURSOR_GET - ret = cursorp->close (cursorp); -#else - ret = cursorp->c_close (cursorp); -#endif - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_CLOSE %s: %s " - "(failed to close database cursor)", - bctx->directory, db_strerror (ret)); - } - } - UNLOCK (&bctx->lock); - -out: - return ret; -} - - -int32_t -bdb_cursor_open (bctx_t *bctx, - DBC **cursorpp) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out); - - LOCK (&bctx->lock); - { - if (bctx->secondary) { - /* do nothing, just continue */ - ret = 0; - } else { - ret = bdb_db_open (bctx); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_OPEN %s: ENOMEM " - "(failed to open secondary database)", - bctx->directory); - ret = -ENOMEM; - } else { - ret = 0; - } - } - - if (ret == 0) { - /* all set, open cursor */ - ret = bctx->secondary->cursor (bctx->secondary, - NULL, cursorpp, 0); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_OPEN %s: %s " - "(failed to open a cursor to database)", - bctx->directory, db_strerror (ret)); - } - } - } - UNLOCK (&bctx->lock); - -out: - return ret; -} - - -/* cache related 
*/ -static bdb_cache_t * -bdb_cache_lookup (bctx_t *bctx, - char *path) -{ - bdb_cache_t *bcache = NULL; - bdb_cache_t *trav = NULL; - char *key = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); - - MAKE_KEY_FROM_PATH (key, path); - - LOCK (&bctx->lock); - { - list_for_each_entry (trav, &bctx->c_list, c_list) { - if (!strcmp (trav->key, key)){ - bcache = trav; - break; - } - } - } - UNLOCK (&bctx->lock); - -out: - return bcache; -} - -static int32_t -bdb_cache_insert (bctx_t *bctx, - DBT *key, - DBT *data) -{ - bdb_cache_t *bcache = NULL; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", data, out); - - LOCK (&bctx->lock); - { - if (bctx->c_count > 5) { - /* most of the times, we enter here */ - /* FIXME: ugly, not supposed to disect any of the - * 'struct list_head' directly */ - if (!list_empty (&bctx->c_list)) { - bcache = list_entry (bctx->c_list.prev, - bdb_cache_t, c_list); - list_del_init (&bcache->c_list); - } - if (bcache->key) { - GF_FREE (bcache->key); - bcache->key = GF_CALLOC (key->size + 1, - sizeof (char), - gf_bdb_mt_char); - GF_VALIDATE_OR_GOTO ("bdb-ll", - bcache->key, unlock); - memcpy (bcache->key, (char *)key->data, - key->size); - } else { - /* should never come here */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CACHE_INSERT %s (%s) " - "(found a cache entry with empty key)", - bctx->directory, (char *)key->data); - } /* if(bcache->key)...else */ - if (bcache->data) { - GF_FREE (bcache->data); - bcache->data = memdup (data->data, data->size); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, - unlock); - bcache->size = data->size; - } else { - /* should never come here */ - gf_log ("bdb-ll", GF_LOG_CRITICAL, - "_BDB_CACHE_INSERT %s (%s) " - "(found a cache entry with no data)", - bctx->directory, (char *)key->data); - } /* if(bcache->data)...else */ - list_add (&bcache->c_list, &bctx->c_list); - ret = 0; - 
} else { - /* we will be entering here very rarely */ - bcache = GF_CALLOC (1, sizeof (*bcache), - gf_bdb_mt_bdb_cache_t); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock); - - bcache->key = GF_CALLOC (key->size + 1, sizeof (char), - gf_bdb_mt_char); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); - memcpy (bcache->key, key->data, key->size); - - bcache->data = memdup (data->data, data->size); - GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); - - bcache->size = data->size; - list_add (&bcache->c_list, &bctx->c_list); - bctx->c_count++; - ret = 0; - } /* if(private->c_count < 5)...else */ - } -unlock: - UNLOCK (&bctx->lock); -out: - return ret; -} - -static int32_t -bdb_cache_delete (bctx_t *bctx, - const char *key) -{ - bdb_cache_t *bcache = NULL; - bdb_cache_t *trav = NULL; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); - - LOCK (&bctx->lock); - { - list_for_each_entry (trav, &bctx->c_list, c_list) { - if (!strcmp (trav->key, key)){ - bctx->c_count--; - bcache = trav; - break; - } - } - - if (bcache) { - list_del_init (&bcache->c_list); - GF_FREE (bcache->key); - GF_FREE (bcache->data); - GF_FREE (bcache); - } - } - UNLOCK (&bctx->lock); - -out: - return 0; -} - -void * -bdb_db_stat (bctx_t *bctx, - DB_TXN *txnid, - uint32_t flags) -{ - DB *storage = NULL; - void *stat = NULL; - int32_t ret = -1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } /* if(bctx->dbp==NULL)...else */ - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - ret = storage->stat (storage, txnid, &stat, flags); - - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_STAT %s: %s " - "(failed to do stat database)", - bctx->directory, db_strerror (ret)); - } -out: - return stat; - -} - -/* bdb_storage_get - retrieve a key/value pair corresponding to @path from 
the - * corresponding db file. - * - * @bctx: bctx_t * corresponding to the parent directory of @path. (should - * always be a valid bctx). bdb_storage_get should never be called if - * @bctx = NULL. - * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @path: path of the file to read from (translated to a database key using - * MAKE_KEY_FROM_PATH) - * @buf: char ** - pointer to a pointer to char. a read buffer is created in - * this procedure and pointer to the buffer is passed through @buf to the - * caller. - * @size: size of the file content to be read. - * @offset: offset from which the file content to be read. - * - * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). - * - * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then - * bdb_storage_get first looks up the cache for key/value pair. if - * bdb_lookup_cache fails, then only DB->get() is called. also, inserts a - * newly read key/value pair to cache through bdb_insert_to_cache. - * - * return: 'number of bytes read' on success or -1 on error. - * - * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb - * xlator's internal cache. 
- */ -static int32_t -bdb_db_get (bctx_t *bctx, - DB_TXN *txnid, - const char *path, - char *buf, - size_t size, - off_t offset) -{ - DB *storage = NULL; - DBT key = {0,}; - DBT value = {0,}; - int32_t ret = -1; - size_t copy_size = 0; - char *key_string = NULL; - bdb_cache_t *bcache = NULL; - int32_t db_flags = 0; - uint8_t need_break = 0; - int32_t retries = 1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); - GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); - - MAKE_KEY_FROM_PATH (key_string, path); - - if (bctx->cache && - ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) { - if (buf) { - copy_size = ((bcache->size - offset) < size)? - (bcache->size - offset) : size; - - memcpy (buf, (bcache->data + offset), copy_size); - ret = copy_size; - } else { - ret = bcache->size; - } - - goto out; - } - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } /* if(bctx->dbp==NULL)...else */ - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - key.data = (char *)key_string; - key.size = strlen (key_string); - key.flags = DB_DBT_USERMEM; - - if (bctx->cache){ - value.flags = DB_DBT_MALLOC; - } else { - if (size) { - value.data = buf; - value.ulen = size; - value.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL; - } else { - value.flags = DB_DBT_MALLOC; - } - value.dlen = size; - value.doff = offset; - } - - do { - /* TODO: we prefer to give our own buffer to value.data - * and ask bdb to fill in it */ - ret = storage->get (storage, txnid, &key, &value, - db_flags); - - if (ret == DB_NOTFOUND) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s: ENOENT" - "(specified key not found in database)", - bctx->directory, key_string); - ret = -1; - need_break = 1; - } else if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s" - "(deadlock detected, retrying for %d " - 
"time)", - bctx->directory, key_string, retries); - } else if (ret == 0) { - /* successfully read data, lets set everything - * in place and return */ - if (bctx->cache) { - if (buf) { - copy_size = ((value.size - offset) < size) ? - (value.size - offset) : size; - - memcpy (buf, (value.data + offset), - copy_size); - ret = copy_size; - } - - bdb_cache_insert (bctx, &key, &value); - } else { - ret = value.size; - } - - if (size == 0) - GF_FREE (value.data); - - need_break = 1; - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_GET %s - %s: %s" - "(failed to retrieve specified key from" - " database)", - bctx->directory, key_string, - db_strerror (ret)); - ret = -1; - need_break = 1; - } - } while (!need_break); - -out: - return ret; -}/* bdb_db_get */ - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_fread (struct bdb_fd *bfd, char *buf, size_t size, off_t offset) -{ - return bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset); -} - -int32_t -bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp) -{ - char *buf = NULL; - size_t size = 0; - int64_t ret = 0; - - ret = bdb_db_get (bctx, NULL, key, NULL, 0, 0); - size = ret; - - if (bufp) { - buf = GF_CALLOC (size, sizeof (char), gf_bdb_mt_char); - *bufp = buf; - ret = bdb_db_get (bctx, NULL, key, buf, size, 0); - } - - return ret; -} - -/* bdb_storage_put - insert a key/value specified to the corresponding DB. - * - * @bctx: bctx_t * corresponding to the parent directory of @path. - * (should always be a valid bctx). bdb_storage_put should never be - * called if @bctx = NULL. - * @txnid: NULL if bdb_storage_put is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @key_string: key of the database entry. - * @buf: pointer to the buffer data to be written as data for @key_string. - * @size: size of @buf. - * @offset: offset in the key's data to be modified with provided data. 
- * @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of - * @key_string to 0 size). - * - * NOTE: bdb_storage_put tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). - * - * NOTE: bdb_storage_put deletes the key/value from bdb xlator's internal cache. - * - * return: 0 on success or -1 on error. - * - * also see: bdb_cache_delete for details on how a cached key/value pair is - * removed. - */ -static int32_t -bdb_db_put (bctx_t *bctx, - DB_TXN *txnid, - const char *key_string, - const char *buf, - size_t size, - off_t offset, - int32_t flags) -{ - DB *storage = NULL; - DBT key = {0,}, value = {0,}; - int32_t ret = -1; - int32_t db_flags = DB_AUTO_COMMIT; - uint8_t need_break = 0; - int32_t retries = 1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - if (bctx->cache) { - ret = bdb_cache_delete (bctx, (char *)key_string); - GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); - } - - key.data = (void *)key_string; - key.size = strlen (key_string); - - /* NOTE: bdb lets us expand the file, suppose value.size > value.len, - * then value.len bytes from value.doff offset and value.size bytes - * will be written from value.doff and data from - * value.doff + value.dlen will be pushed value.doff + value.size - */ - value.data = (void *)buf; - - if (flags & BDB_TRUNCATE_RECORD) { - value.size = size; - value.doff = 0; - value.dlen = offset; - } else { - value.size = size; - value.dlen = size; - value.doff = offset; - } - value.flags = DB_DBT_PARTIAL; - if (buf == NULL && size == 0) - /* truncate called us */ - value.flags = 0; - - do { - ret = storage->put (storage, txnid, &key, &value, db_flags); - if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log 
("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_PUT %s - %s" - "(deadlock detected, retying for %d time)", - bctx->directory, key_string, retries); - } else if (ret) { - /* write failed */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_PUT %s - %s: %s" - "(failed to put specified entry into database)", - bctx->directory, key_string, db_strerror (ret)); - need_break = 1; - } else { - /* successfully wrote */ - ret = 0; - need_break = 1; - } - } while (!need_break); -out: - return ret; -}/* bdb_db_put */ - -int32_t -bdb_db_icreate (struct bdb_ctx *bctx, const char *key) -{ - return bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0); -} - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset) -{ - return bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0); -} - -/* TODO: handle errors here and log. propogate only the errno to caller */ -int32_t -bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size) -{ - return bdb_db_put (bctx, NULL, key, buf, size, 0, 0); -} - -int32_t -bdb_db_itruncate (struct bdb_ctx *bctx, const char *key) -{ - return bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0); -} - -/* bdb_storage_del - delete a key/value pair corresponding to @path from - * corresponding db file. - * - * @bctx: bctx_t * corresponding to the parent directory of @path. - * (should always be a valid bctx). bdb_storage_del should never be called - * if @bctx = NULL. - * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction - * or a valid DB_TXN *, when embedded in an explicit transaction. - * @path: path to the file, whose key/value pair has to be deleted. - * - * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL - * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by - * bdb_table_prune()). - * - * return: 0 on success or -1 on error. 
- */ -static int32_t -bdb_db_del (bctx_t *bctx, - DB_TXN *txnid, - const char *key_string) -{ - DB *storage = NULL; - DBT key = {0,}; - int32_t ret = -1; - int32_t db_flags = 0; - uint8_t need_break = 0; - int32_t retries = 1; - - LOCK (&bctx->lock); - { - if (bctx->primary == NULL) { - ret = bdb_db_open (bctx); - storage = bctx->primary; - } else { - /* we are just fine, lets continue */ - storage = bctx->primary; - } - } - UNLOCK (&bctx->lock); - - GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); - - ret = bdb_cache_delete (bctx, key_string); - GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); - - key.data = (char *)key_string; - key.size = strlen (key_string); - key.flags = DB_DBT_USERMEM; - - do { - ret = storage->del (storage, txnid, &key, db_flags); - - if (ret == DB_NOTFOUND) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s: ENOENT" - "(failed to delete entry, could not be " - "found in the database)", - bctx->directory, key_string); - need_break = 1; - } else if (ret == DB_LOCK_DEADLOCK) { - retries++; - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s" - "(deadlock detected, retying for %d time)", - bctx->directory, key_string, retries); - } else if (ret == 0) { - /* successfully deleted the entry */ - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s" - "(successfully deleted entry from database)", - bctx->directory, key_string); - ret = 0; - need_break = 1; - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_DB_DEL %s - %s: %s" - "(failed to delete entry from database)", - bctx->directory, key_string, db_strerror (ret)); - ret = -1; - need_break = 1; - } - } while (!need_break); -out: - return ret; -} - -int32_t -bdb_db_iremove (bctx_t *bctx, - const char *key) -{ - return bdb_db_del (bctx, NULL, key); -} - -/* NOTE: bdb version compatibility wrapper */ -int32_t -bdb_cursor_get (DBC *cursorp, - DBT *sec, DBT *pri, - DBT *val, - int32_t flags) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); - -#ifdef 
HAVE_BDB_CURSOR_GET - ret = cursorp->pget (cursorp, sec, pri, val, flags); -#else - ret = cursorp->c_pget (cursorp, sec, pri, val, flags); -#endif - if ((ret != 0) && (ret != DB_NOTFOUND)) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CURSOR_GET: %s" - "(failed to retrieve entry from database cursor)", - db_strerror (ret)); - } - -out: - return ret; -}/* bdb_cursor_get */ - -int32_t -bdb_dirent_size (DBT *key) -{ - return GF_DIR_ALIGN (24 /* FIX MEEEE!!! */ + key->size); -} - - - -/* bdb_dbenv_init - initialize DB_ENV - * - * initialization includes: - * 1. opening DB_ENV (db_env_create(), DB_ENV->open()). - * NOTE: see private->envflags for flags used. - * 2. DB_ENV->set_lg_dir - set log directory to be used for storing log files - * (log files are the files in which transaction logs are written by db). - * 3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically - * clear the unwanted log files (flushed at each checkpoint). - * 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed - * error logs. used only for debbuging purpose. - * - * return: returns a valid DB_ENV * on success or NULL on error. - * - */ -static DB_ENV * -bdb_dbenv_init (xlator_t *this, - char *directory) -{ - /* Create a DB environment */ - DB_ENV *dbenv = NULL; - int32_t ret = 0; - bdb_private_t *private = NULL; - int32_t fatal_flags = 0; - - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (directory, err); - - private = this->private; - VALIDATE_OR_GOTO (private, err); - - ret = db_env_create (&dbenv, 0); - VALIDATE_OR_GOTO ((ret == 0), err); - - /* NOTE: set_errpfx returns 'void' */ - dbenv->set_errpfx(dbenv, this->name); - - ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT); - VALIDATE_OR_GOTO ((ret == 0), err); - - ret = dbenv->open(dbenv, directory, - private->envflags, - S_IRUSR | S_IWUSR); - if ((ret != 0) && (ret != DB_RUNRECOVERY)) { - gf_log (this->name, GF_LOG_CRITICAL, - "failed to join Berkeley DB environment at %s: %s." 
- "please run manual recovery and retry running " - "glusterfs", - directory, db_strerror (ret)); - dbenv = NULL; - goto err; - } else if (ret == DB_RUNRECOVERY) { - fatal_flags = ((private->envflags & (~DB_RECOVER)) - | DB_RECOVER_FATAL); - ret = dbenv->open(dbenv, directory, fatal_flags, - S_IRUSR | S_IWUSR); - if (ret != 0) { - gf_log (this->name, GF_LOG_CRITICAL, - "failed to join Berkeley DB environment in " - "recovery mode at %s: %s. please run manual " - "recovery and retry running glusterfs", - directory, db_strerror (ret)); - dbenv = NULL; - goto err; - } - } - - ret = 0; -#if (DB_VERSION_MAJOR == 4 && \ - DB_VERSION_MINOR == 7) - if (private->log_auto_remove) { - ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1); - } else { - ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0); - } -#else - if (private->log_auto_remove) { - ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1); - } else { - ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0); - } -#endif - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "autoremoval of transactional log files could not be " - "configured (%s). you may have to do a manual " - "monitoring of transactional log files and remove " - "periodically.", - db_strerror (ret)); - goto err; - } - - if (private->transaction) { - ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1); - - if (ret != 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "configuration of auto-commit failed for " - "database environment at %s. none of the " - "operations will be embedded in transaction " - "unless explicitly done so.", - db_strerror (ret)); - goto err; - } - - if (private->txn_timeout) { - ret = dbenv->set_timeout (dbenv, private->txn_timeout, - DB_SET_TXN_TIMEOUT); - if (ret != 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "could not configure Berkeley DB " - "transaction timeout to %d (%s). 
please" - " review 'option transaction-timeout %d" - "' option.", - private->txn_timeout, - db_strerror (ret), - private->txn_timeout); - goto err; - } - } - - if (private->lock_timeout) { - ret = dbenv->set_timeout(dbenv, - private->txn_timeout, - DB_SET_LOCK_TIMEOUT); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "could not configure Berkeley DB " - "lock timeout to %d (%s). please" - " review 'option lock-timeout %d" - "' option.", - private->lock_timeout, - db_strerror (ret), - private->lock_timeout); - goto err; - } - } - - ret = dbenv->set_lg_dir (dbenv, private->logdir); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "failed to configure libdb transaction log " - "directory at %s. please review the " - "'option logdir %s' option.", - db_strerror (ret), private->logdir); - goto err; - } - } - - if (private->errfile) { - private->errfp = fopen (private->errfile, "a+"); - if (private->errfp) { - dbenv->set_errfile (dbenv, private->errfp); - } else { - gf_log ("bdb-ll", GF_LOG_ERROR, - "failed to open error logging file for " - "libdb (Berkeley DB) internal logging (%s)." - "please review the 'option errfile %s' option.", - strerror (errno), private->errfile); - goto err; - } - } - - return dbenv; -err: - if (dbenv) { - dbenv->close (dbenv, 0); - } - - return NULL; -} - -#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) - -/* bdb_checkpoint - during transactional usage, db does not directly write the - * data to db files, instead db writes a 'log' (similar to a journal entry) - * into a log file. db normally clears the log files during opening of an - * environment. since we expect a filesystem server to run for a pretty long - * duration and flushing 'log's during dbenv->open would prove very costly, if - * we accumulate the log entries for one complete run of glusterfs server. to - * flush the logs frequently, db provides a mechanism called 'checkpointing'. 
- * when we do a checkpoint, db flushes the logs to disk (writes changes to db - * files) and we can also clear the accumulated log files after checkpointing. - * NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint() - * call. - * - * @data: xlator_t of the current instance of bdb xlator. - * - * bdb_checkpoint is called in a different thread from the main glusterfs - * thread. bdb xlator creates the checkpoint thread after successfully opening - * the db environment. - * NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem - * thread. - * - * db environment checkpointing frequency is controlled by - * 'option checkpoint-timeout <time-in-seconds>' in volfile. - * - * NOTE: checkpointing thread is started only if 'option transaction on' - * specified in volfile. checkpointing is not valid for non-transactional - * environments. - * - */ -static void * -bdb_checkpoint (void *data) -{ - xlator_t *this = NULL; - struct bdb_private *private = NULL; - DB_ENV *dbenv = NULL; - int32_t ret = 0; - uint32_t active = 0; - - this = (xlator_t *) data; - dbenv = BDB_ENV(this); - private = this->private; - - for (;;sleep (private->checkpoint_interval)) { - LOCK (&private->active_lock); - active = private->active; - UNLOCK (&private->active_lock); - - if (active) { - ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); - if (ret) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: %s" - "(failed to checkpoint environment)", - db_strerror (ret)); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: successfully " - "checkpointed"); - } - } else { - ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); - if (ret) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "_BDB_CHECKPOINT: %s" - "(final checkpointing failed. 
might " - "need to run recovery tool manually on " - "next usage of this database " - "environment)", - db_strerror (ret)); - } else { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "_BDB_CHECKPOINT: final successfully " - "checkpointed"); - } - break; - } - } - - return NULL; -} - - -/* bdb_db_init - initialize bdb xlator - * - * reads the options from @options dictionary and sets appropriate values in - * @this->private. also initializes DB_ENV. - * - * return: 0 on success or -1 on error - * (with logging the error through gf_log()). - */ -int -bdb_db_init (xlator_t *this, - dict_t *options) -{ - /* create a db entry for root */ - int32_t op_ret = 0; - bdb_private_t *private = NULL; - bctx_table_t *table = NULL; - - char *checkpoint_interval_str = NULL; - char *page_size_str = NULL; - char *lru_limit_str = NULL; - char *timeout_str = NULL; - char *access_mode = NULL; - char *endptr = NULL; - char *errfile = NULL; - char *directory = NULL; - char *logdir = NULL; - char *mode = NULL; - char *mode_str = NULL; - int ret = -1; - int idx = 0; - struct stat stbuf = {0,}; - - private = this->private; - - /* cache is always on */ - private->cache = ON; - - ret = dict_get_str (options, "access-mode", &access_mode); - if ((ret == 0) - && (!strcmp (access_mode, "btree"))) { - gf_log (this->name, GF_LOG_DEBUG, - "using BTREE access mode to access libdb " - "(Berkeley DB)"); - private->access_mode = DB_BTREE; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "using HASH access mode to access libdb (Berkeley DB)"); - private->access_mode = DB_HASH; - } - - ret = dict_get_str (options, "mode", &mode); - if ((ret == 0) - && (!strcmp (mode, "cache"))) { - gf_log (this->name, GF_LOG_DEBUG, - "cache data mode selected for 'storage/bdb'. 
filesystem" - " operations are not transactionally protected and " - "system crash does not guarantee recoverability of " - "data"); - private->envflags = DB_CREATE | DB_INIT_LOG | - DB_INIT_MPOOL | DB_THREAD; - private->dbflags = DB_CREATE | DB_THREAD; - private->transaction = OFF; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "persistent data mode selected for 'storage/bdb'. each" - "filesystem operation is guaranteed to be Berkeley DB " - "transaction protected."); - private->transaction = ON; - private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | - DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; - private->dbflags = DB_CREATE | DB_THREAD; - - - ret = dict_get_str (options, "lock-timeout", &timeout_str); - - if (ret == 0) { - ret = gf_string2time (timeout_str, - &private->lock_timeout); - - if (private->lock_timeout > 4260000) { - /* db allows us to DB_SET_LOCK_TIMEOUT to be - * set to a maximum of 71 mins - * (4260000 milliseconds) */ - gf_log (this->name, GF_LOG_DEBUG, - "Berkeley DB lock-timeout parameter " - "(%d) is out of range. please specify" - " a valid timeout value for " - "lock-timeout and retry.", - private->lock_timeout); - goto err; - } - } - ret = dict_get_str (options, "transaction-timeout", - &timeout_str); - if (ret == 0) { - ret = gf_string2time (timeout_str, - &private->txn_timeout); - - if (private->txn_timeout > 4260000) { - /* db allows us to DB_SET_TXN_TIMEOUT to be set - * to a maximum of 71 mins - * (4260000 milliseconds) */ - gf_log (this->name, GF_LOG_DEBUG, - "Berkeley DB lock-timeout parameter " - "(%d) is out of range. 
please specify" - " a valid timeout value for " - "lock-timeout and retry.", - private->lock_timeout); - goto err; - } - } - - private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL; - ret = dict_get_str (options, "checkpoint-interval", - &checkpoint_interval_str); - if (ret == 0) { - ret = gf_string2time (checkpoint_interval_str, - &private->checkpoint_interval); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "'%"PRIu32"' is not a valid parameter " - "for checkpoint-interval option. " - "please specify a valid " - "checkpoint-interval and retry", - private->checkpoint_interval); - goto err; - } - } - } - - ret = dict_get_str (options, "file-mode", &mode_str); - if (ret == 0) { - private->file_mode = strtol (mode_str, &endptr, 8); - - if ((*endptr) || - (!IS_VALID_FILE_MODE(private->file_mode))) { - gf_log (this->name, GF_LOG_DEBUG, - "'%o' is not a valid parameter for file-mode " - "option. please specify a valid parameter for " - "file-mode and retry.", - private->file_mode); - goto err; - } - } else { - private->file_mode = DEFAULT_FILE_MODE; - } - private->symlink_mode = private->file_mode | S_IFLNK; - private->file_mode = private->file_mode | S_IFREG; - - ret = dict_get_str (options, "dir-mode", &mode_str); - if (ret == 0) { - private->dir_mode = strtol (mode_str, &endptr, 8); - if ((*endptr) || - (!IS_VALID_FILE_MODE(private->dir_mode))) { - gf_log (this->name, GF_LOG_DEBUG, - "'%o' is not a valid parameter for dir-mode " - "option. 
please specify a valid parameter for " - "dir-mode and retry.", - private->dir_mode); - goto err; - } - } else { - private->dir_mode = DEFAULT_DIR_MODE; - } - - private->dir_mode = private->dir_mode | S_IFDIR; - - table = GF_CALLOC (1, sizeof (*table), gf_bdb_mt_bctx_table_t); - if (table == NULL) { - gf_log ("bdb-ll", GF_LOG_CRITICAL, - "memory allocation for 'storage/bdb' internal " - "context table failed."); - goto err; - } - - INIT_LIST_HEAD(&(table->b_lru)); - INIT_LIST_HEAD(&(table->active)); - INIT_LIST_HEAD(&(table->purge)); - - LOCK_INIT (&table->lock); - LOCK_INIT (&table->checkpoint_lock); - - table->transaction = private->transaction; - table->access_mode = private->access_mode; - table->dbflags = private->dbflags; - table->this = this; - - ret = dict_get_str (options, "lru-limit", - &lru_limit_str); - - /* TODO: set max lockers and max txns to accomodate - * for more than lru_limit */ - if (ret == 0) { - ret = gf_string2uint32 (lru_limit_str, - &table->lru_limit); - gf_log ("bdb-ll", GF_LOG_DEBUG, - "setting lru limit of 'storage/bdb' internal context" - "table to %d. maximum of %d unused databases can be " - "open at any given point of time.", - table->lru_limit, table->lru_limit); - } else { - table->lru_limit = BDB_DEFAULT_LRU_LIMIT; - } - - ret = dict_get_str (options, "page-size", - &page_size_str); - - if (ret == 0) { - ret = gf_string2bytesize (page_size_str, - &table->page_size); - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "\"%s\" is an invalid parameter to " - "\"option page-size\". please specify a valid " - "size and retry.", - page_size_str); - goto err; - } - - if (!PAGE_SIZE_IN_RANGE(table->page_size)) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "\"%s\" is out of range for Berkeley DB " - "page-size. allowed page-size range is %d to " - "%d. 
please specify a page-size value in the " - "range and retry.", - page_size_str, BDB_LL_PAGE_SIZE_MIN, - BDB_LL_PAGE_SIZE_MAX); - goto err; - } - } else { - table->page_size = BDB_LL_PAGE_SIZE_DEFAULT; - } - - table->hash_size = BDB_DEFAULT_HASH_SIZE; - table->b_hash = GF_CALLOC (BDB_DEFAULT_HASH_SIZE, - sizeof (struct list_head), - gf_bdb_mt_list_head); - - for (idx = 0; idx < table->hash_size; idx++) - INIT_LIST_HEAD(&(table->b_hash[idx])); - - private->b_table = table; - - ret = dict_get_str (options, "errfile", &errfile); - if (ret == 0) { - private->errfile = gf_strdup (errfile); - gf_log (this->name, GF_LOG_DEBUG, - "using %s as error logging file for libdb (Berkeley DB " - "library) internal logging.", private->errfile); - } - - ret = dict_get_str (options, "directory", &directory); - - if (ret == 0) { - ret = dict_get_str (options, "logdir", &logdir); - - if (ret < 0) { - gf_log ("bdb-ll", GF_LOG_DEBUG, - "using the database environment home " - "directory (%s) itself as transaction log " - "directory", directory); - private->logdir = gf_strdup (directory); - - } else { - private->logdir = gf_strdup (logdir); - - op_ret = stat (private->logdir, &stbuf); - if ((op_ret != 0) - || (!S_ISDIR (stbuf.st_mode))) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "specified logdir %s does not exist. 
" - "please provide a valid existing " - "directory as parameter to 'option " - "logdir'", - private->logdir); - goto err; - } - } - - private->b_table->dbenv = bdb_dbenv_init (this, directory); - if (private->b_table->dbenv == NULL) { - gf_log ("bdb-ll", GF_LOG_ERROR, - "initialization of database environment " - "failed"); - goto err; - } else { - if (private->transaction) { - /* all well, start the checkpointing thread */ - LOCK_INIT (&private->active_lock); - - LOCK (&private->active_lock); - { - private->active = 1; - } - UNLOCK (&private->active_lock); - pthread_create (&private->checkpoint_thread, - NULL, bdb_checkpoint, this); - } - } - } - - return op_ret; -err: - if (table) { - GF_FREE (table->b_hash); - GF_FREE (table); - } - if (private) { - if (private->errfile) - GF_FREE (private->errfile); - - if (private->logdir) - GF_FREE (private->logdir); - } - - return -1; -} diff --git a/xlators/storage/bdb/src/bdb-mem-types.h b/xlators/storage/bdb/src/bdb-mem-types.h deleted file mode 100644 index e68b8c0ca..000000000 --- a/xlators/storage/bdb/src/bdb-mem-types.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - - -#ifndef __POSIX_MEM_TYPES_H__ -#define __POSIX_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_bdb_mem_types_ { - gf_bdb_mt_bctx_t = gf_common_mt_end + 1, - gf_bdb_mt_bdb_fd, - gf_bdb_mt_dir_entry_t, - gf_bdb_mt_char, - gf_bdb_mt_dir_entry_t, - gf_bdb_mt_char, - gf_bdb_mt_bdb_private, - gf_bdb_mt_uint32_t, - gf_bdb_mt_char, - gf_bdb_mt_bdb_cache_t, - gf_bdb_mt_char, - gf_bdb_mt_bctx_table_t, - gf_bdb_mt_list_head, - gf_bdb_mt_end, -}; -#endif diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c deleted file mode 100644 index 1a5ef02b2..000000000 --- a/xlators/storage/bdb/src/bdb.c +++ /dev/null @@ -1,3603 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/* bdb based storage translator - named as 'bdb' translator - * - * - * There can be only two modes for files existing on bdb translator: - * 1. DIRECTORY - directories are stored by bdb as regular directories on - * back-end file-system. directories also have an entry in the ns_db.db of - * their parent directory. - * 2. REGULAR FILE - regular files are stored as records in the storage_db.db - * present in the directory. regular files also have an entry in ns_db.db - * - * Internally bdb has a maximum of three different types of logical files - * associated with each directory: - * 1. 
storage_db.db - storage database, used to store the data corresponding to - * regular files in the form of key/value pair. file-name is the 'key' and - * data is 'value'. - * 2. directory (all subdirectories) - any subdirectory will have a regular - * directory entry. - */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#define __XOPEN_SOURCE 500 - -#include <stdint.h> -#include <sys/time.h> -#include <errno.h> -#include <ftw.h> -#include <libgen.h> - -#include "glusterfs.h" -#include "dict.h" -#include "logging.h" -#include "bdb.h" -#include "xlator.h" -#include "defaults.h" -#include "common-utils.h" - -/* to be used only by fops, nobody else */ -#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) -#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table) - - -int32_t -bdb_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t dev) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *key_string = NULL; /* after translating path to DB key */ - char *db_path = NULL; - bctx_t *bctx = NULL; - struct stat stbuf = {0,}; - - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - if (!S_ISREG(mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): EPERM" - "(mknod supported only for regular files. 
" - "file mode '%o' not supported)", - loc->parent->ino, loc->name, loc->path, mode); - op_ret = -1; - op_errno = EPERM; - goto out; - } /* if(!S_ISREG(mode)) */ - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): EINVAL" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_icreate (bctx, key_string); - if (op_ret > 0) { - /* create successful */ - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = mode; - stbuf.st_size = 0; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, \ - stbuf.st_blksize); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "MKNOD %"PRId64"/%s (%s): ENOMEM" - "(failed to create database entry)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = EINVAL; /* TODO: errno sari illa */ - goto out; - }/* if (!op_ret)...else */ - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - return 0; -} - -static inline int32_t -is_dir_empty (xlator_t *this, - loc_t *loc) -{ - int32_t ret = 1; - bctx_t *bctx = NULL; - DIR *dir = NULL; - char *real_path = NULL; - void *dbstat = NULL; - struct dirent *entry = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - ret = 
-ENOMEM; - goto out; - } - - dbstat = bdb_db_stat (bctx, NULL, 0); - if (dbstat) { - switch (bctx->table->access_mode) - { - case DB_HASH: - ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0); - break; - case DB_BTREE: - case DB_RECNO: - ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0); - break; - case DB_QUEUE: - ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0); - break; - case DB_UNKNOWN: - gf_log (this->name, GF_LOG_CRITICAL, - "unknown access-mode set for database"); - ret = 0; - } - } else { - ret = -EBUSY; - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - dir = opendir (real_path); - if (dir == NULL) { - ret = -errno; - goto out; - } - - while ((entry = readdir (dir))) { - if ((!IS_BDB_PRIVATE_FILE(entry->d_name)) && - (!IS_DOT_DOTDOT(entry->d_name))) { - ret = 0; - break; - }/* if(!IS_BDB_PRIVATE_FILE()) */ - } /* while(true) */ - closedir (dir); -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - return ret; -} - -int32_t -bdb_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - STACK_UNWIND (frame, -1, EXDEV, NULL); - return 0; -} - -int32_t -bdb_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - STACK_UNWIND (frame, -1, EXDEV, NULL, NULL); - return 0; -} - -int32_t -is_space_left (xlator_t *this, - size_t size) -{ - struct bdb_private *private = this->private; - struct statvfs stbuf = {0,}; - int32_t ret = -1; - fsblkcnt_t req_blocks = 0; - fsblkcnt_t usable_blocks = 0; - - ret = statvfs (private->export_path, &stbuf); - if (ret != 0) { - ret = 0; - } else { - req_blocks = (size / stbuf.f_frsize) + 1; - - usable_blocks = (stbuf.f_bfree - BDB_ENOSPC_THRESHOLD); - - if (req_blocks < usable_blocks) - ret = 1; - else - ret = 0; - } - - return ret; -} - -int32_t -bdb_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - int32_t 
op_ret = -1; - int32_t op_errno = EPERM; - char *db_path = NULL; - struct stat stbuf = {0,}; - bctx_t *bctx = NULL; - struct bdb_private *private = NULL; - char *key_string = NULL; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - private = this->private; - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): EINVAL" - "(database file missing)", - loc->parent->ino, loc->name, loc->path); - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_icreate (bctx, key_string); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to create database entry)", - loc->parent->ino, loc->name, loc->path); - op_errno = EINVAL; /* TODO: errno sari illa */ - goto out; - } - - /* create successful */ - bfd = GF_CALLOC (1, sizeof (*bfd), gf_bdb_mt_bdb_fd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64"/%s (%s): ENOMEM" - "(failed to allocate memory for internal fd context)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* NOTE: bdb_get_bctx_from () returns bctx with a ref */ - bfd->ctx = bctx; - bfd->key = gf_strdup (key_string); - if (bfd->key == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CREATE %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->key)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - 
goto out; - } - - BDB_FCTX_SET (fd, this, bfd); - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = private->file_mode; - stbuf.st_size = 0; - stbuf.st_nlink = 1; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_ret = 0; - op_errno = 0; -out: - STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); - - return 0; -} - - -/* bdb_open - * - * as input parameters bdb_open gets the file name, i.e key. bdb_open should - * effectively - * do: store key, open storage db, store storage-db pointer. - * - */ -int32_t -bdb_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *key_string = NULL; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to lookup database handle)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - bfd = GF_CALLOC (1, sizeof (*bfd), gf_bdb_mt_bdb_fd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd context)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* NOTE: bctx_parent () returns bctx with a ref */ - bfd->ctx = bctx; - - MAKE_KEY_FROM_PATH (key_string, loc->path); - bfd->key = gf_strdup (key_string); - if (bfd->key == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPEN %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->key)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - BDB_FCTX_SET (fd, this, bfd); - op_ret = 
0; -out: - STACK_UNWIND (frame, op_ret, op_errno, fd); - - return 0; -} - -int32_t -bdb_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct iovec vec = {0,}; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - char *db_path = NULL; - int32_t read_size = 0; - struct iobref *iobref = NULL; - struct iobuf *iobuf = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino, size, offset); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EINVAL" - "(database file missing)", - fd->inode->ino, size, offset); - goto out; - } - - iobuf = iobuf_get (this->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - /* we are ready to go */ - op_ret = bdb_db_fread (bfd, iobuf->ptr, size, offset); - read_size = op_ret; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "READV %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD" - "(failed to find entry in database)", - fd->inode->ino, size, offset); - op_ret = -1; - op_errno = ENOENT; - goto out; - } else if (op_ret == 0) { - goto out; - } - - iobref = iobref_new (); - if (iobref == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (size < read_size) { - op_ret = size; - read_size = size; - } - - iobref_add (iobref, iobuf); - - vec.iov_base = 
iobuf->ptr; - vec.iov_len = read_size; - - stbuf.st_ino = fd->inode->ino; - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_ret = size; -out: - STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf, iobuf); - - if (iobref) - iobref_unref (iobref); - - if (iobuf) - iobuf_unref (iobuf); - - return 0; -} - - -int32_t -bdb_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset, - struct iobref *iobref) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - int32_t idx = 0; - off_t c_off = offset; - int32_t c_ret = -1; - char *db_path = NULL; - size_t total_size = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - GF_VALIDATE_OR_GOTO (this->name, vector, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino, count, offset); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL" - "(database file missing)", - fd->inode->ino, count, offset); - goto out; - } - - for (idx = 0; idx < count; idx++) - total_size += vector[idx].iov_len; - - if (!is_space_left (this, total_size)) { - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32" (%"GF_PRI_SIZET"),%" - PRId64": ENOSPC " - "(not enough space after internal measurement)", - fd->inode->ino, count, total_size, offset); - op_ret = -1; - op_errno = ENOSPC; - goto out; - } - - /* we are ready to go */ - for (idx = 0; idx < count; idx++) { - c_ret = bdb_db_fwrite 
(bfd, vector[idx].iov_base, - vector[idx].iov_len, c_off); - if (c_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "WRITEV %"PRId64" - %"PRId32",%"PRId64": EINVAL" - "(database write at %"PRId64" failed)", - fd->inode->ino, count, offset, c_off); - break; - } else { - c_off += vector[idx].iov_len; - } - op_ret += vector[idx].iov_len; - } /* for(idx=0;...)... */ - - if (c_ret) { - /* write failed after a point, not an error */ - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - goto out; - } - - /* NOTE: we want to increment stbuf->st_size, as stored in db */ - stbuf.st_size = op_ret; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - op_errno = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - return 0; -} - -int32_t -bdb_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct bdb_fd *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "FLUSH %"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - /* do nothing */ - op_ret = 0; - op_errno = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -bdb_release (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EBADFD; - struct bdb_fd *bfd = NULL; - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASE %"PRId64": EBADFD" - "(internal fd not found through fd)", - fd->inode->ino); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - bctx_unref (bfd->ctx); - bfd->ctx = NULL; - - if (bfd->key) - GF_FREE (bfd->key); /* we did strdup() in bdb_open() */ - GF_FREE (bfd); - op_ret = 0; - op_errno = 
0; - -out: - return 0; -}/* bdb_release */ - - -int32_t -bdb_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t datasync) -{ - STACK_UNWIND (frame, 0, 0); - return 0; -}/* bdb_fsync */ - -static int gf_bdb_lk_log; - -int32_t -bdb_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - struct gf_flock nullock = {0, }; - - if (BDB_TIMED_LOG (ENOTSUP, gf_bdb_lk_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "LK %"PRId64": ENOTSUP " - "(load \"features/locks\" translator to enable " - "lock support)", - fd->inode->ino); - } - - STACK_UNWIND (frame, -1, ENOTSUP, &nullock); - return 0; -}/* bdb_lk */ - -/* bdb_lookup - * - * there are four possibilities for a file being looked up: - * 1. file exists and is a directory. - * 2. file exists and is a symlink. - * 3. file exists and is a regular file. - * 4. file does not exist. - * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a - * directory or symlink, lstat() succeeds. lookup continues to check if the - * @loc belongs to case-3 only if lstat() fails. - * to check for case 3, bdb_lookup does a bdb_db_iread() for the given @loc. - * (see description of bdb_db_iread() for more details on how @loc is transformed - * into db handle and key). if check for case 1, 2 and 3 fail, we proceed to - * conclude that file doesn't exist (case 4). - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @need_xattr: if need_xattr != 0, we are asked to return all the extended - * attributed of @loc, if any exist, in a dictionary. if @loc is a regular - * file and need_xattr is set, then we look for value of need_xattr. if - * need_xattr > sizo-of-the-file @loc, then the file content of @loc is - * returned in dictionary of xattr with 'glusterfs.content' as dictionary key. - * - * NOTE: bdb currently supports only directories, symlinks and regular files. 
- * - * NOTE: bdb_lookup returns the 'struct stat' of underlying file itself, in - * case of directory and symlink (st_ino is modified as bdb allocates its own - * set of inodes of all files). for regular files, bdb uses 'struct stat' of - * the database file in which the @loc is stored as templete and modifies - * st_ino (see bdb_inode_transform for more details), st_mode (can be set in - * volfile 'option file-mode <mode>'), st_size (exact size of the @loc - * contents), st_blocks (block count on the underlying filesystem to - * accomodate st_size, see BDB_COUNT_BLOCKS in bdb.h for more details). - */ -int32_t -bdb_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - struct stat stbuf = {0, }; - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - dict_t *xattr = NULL; - char *pathname = NULL; - char *directory = NULL; - char *real_path = NULL; - bctx_t *bctx = NULL; - char *db_path = NULL; - struct bdb_private *private = NULL; - char *key_string = NULL; - int32_t entry_size = 0; - char *file_content = NULL; - uint64_t need_xattr = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - private = this->private; - - MAKE_REAL_PATH (real_path, this, loc->path); - - pathname = gf_strdup (loc->path); - GF_VALIDATE_OR_GOTO (this->name, pathname, out); - - directory = dirname (pathname); - GF_VALIDATE_OR_GOTO (this->name, directory, out); - - if (!strcmp (directory, loc->path)) { - /* SPECIAL CASE: looking up root */ - op_ret = lstat (real_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - /* bctx_lookup() returns NULL only when its time to wind up, - * we should shutdown functioning */ - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64" 
(%s): ENOMEM" - "(failed to lookup database handle)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - stbuf.st_ino = 1; - stbuf.st_mode = private->dir_mode; - - op_ret = 0; - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = lstat (real_path, &stbuf); - if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){ - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (loc->ino) { - /* revalidating directory inode */ - stbuf.st_ino = loc->ino; - } else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - stbuf.st_mode = private->dir_mode; - - op_ret = 0; - goto out; - - } else if (op_ret == 0) { - /* a symlink */ - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (loc->ino) { - stbuf.st_ino = loc->ino; - } else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - - stbuf.st_mode = private->symlink_mode; - - op_ret = 0; - goto out; - - } - - /* for regular files */ - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOMEM" - "(failed to lookup database handle for parent)", - loc->parent->ino, loc->name, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) { - entry_size = bdb_db_iread (bctx, key_string, &file_content); - } else { - entry_size = bdb_db_iread (bctx, key_string, NULL); - } - - op_ret = entry_size; 
- if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): ENOENT" - "(database entry not found)", - loc->parent->ino, loc->name, loc->path); - op_errno = ENOENT; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "LOOKUP %"PRId64"/%s (%s): %s", - loc->parent->ino, loc->name, loc->path, - strerror (op_errno)); - goto out; - } - - if (entry_size - && (need_xattr >= entry_size) - && (file_content)) { - xattr = dict_new (); - op_ret = dict_set_dynptr (xattr, "glusterfs.content", - file_content, entry_size); - if (op_ret < 0) { - /* continue without giving file contents */ - GF_FREE (file_content); - } - } else { - if (file_content) - GF_FREE (file_content); - } - - if (loc->ino) { - /* revalidate */ - stbuf.st_ino = loc->ino; - stbuf.st_size = entry_size; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - } else { - /* fresh lookup, create an inode number */ - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_size = entry_size; - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, - stbuf.st_blksize); - }/* if(inode->ino)...else */ - stbuf.st_nlink = 1; - stbuf.st_mode = private->file_mode; - - op_ret = 0; -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - if (pathname) - GF_FREE (pathname); - - if (xattr) - dict_ref (xattr); - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr); - - if (xattr) - dict_unref (xattr); - - return 0; - -}/* bdb_lookup */ - -int32_t -bdb_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - - struct stat stbuf = {0,}; - char *real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct bdb_private *private = NULL; - char *db_path = NULL; - bctx_t *bctx 
= NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - private = this->private; - GF_VALIDATE_OR_GOTO (this->name, private, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = lstat (real_path, &stbuf); - op_errno = errno; - if (op_ret == 0) { - /* directory or symlink */ - stbuf.st_ino = loc->inode->ino; - if (S_ISDIR(stbuf.st_mode)) - stbuf.st_mode = private->dir_mode; - else - stbuf.st_mode = private->symlink_mode; - /* we are done, lets unwind the stack */ - goto out; - } - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "STAT %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "STAT %"PRId64" (%s): %s" - "(failed to stat on database file)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - stbuf.st_size = bdb_db_iread (bctx, loc->path, NULL); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - stbuf.st_ino = loc->inode->ino; - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -}/* bdb_stat */ - - - -/* bdb_opendir - in the world of bdb, open/opendir is all about opening - * correspondind databases. opendir in particular, opens the database for the - * directory which is to be opened. after opening the database, a cursor to - * the database is also created. cursor helps us get the dentries one after - * the other, and cursor maintains the state about current positions in - * directory. 
pack 'pointer to db', 'pointer to the cursor' into - * struct bdb_dir and store it in fd->ctx, we get from our parent xlator. - * - * @frame: call frame - * @this: our information, as we filled during init() - * @loc: location information - * @fd: file descriptor structure (glusterfs internal) - * - * return value - immaterial, async call. - * - */ -int32_t -bdb_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - char *real_path = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - struct bdb_dir *bfd = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - bfd = GF_CALLOC (1, sizeof (*bfd), gf_bdb_mt_bdb_fd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - bfd->dir = opendir (real_path); - if (bfd->dir == NULL) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - /* NOTE: bctx_lookup() return bctx with ref */ - bfd->ctx = bctx; - - bfd->path = gf_strdup (real_path); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "OPENDIR %"PRId64" (%s): ENOMEM" - "(failed to allocate memory for internal fd->path)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - BDB_FCTX_SET (fd, this, bfd); - op_ret = 0; -out: - STACK_UNWIND (frame, op_ret, op_errno, 
fd); - return 0; -err: - if (bctx) - bctx_unref (bctx); - if (bfd) { - if (bfd->dir) - closedir (bfd->dir); - - GF_FREE (bfd); - } - - return 0; -}/* bdb_opendir */ - -int32_t -bdb_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off, - int32_t flag) -{ - struct bdb_dir *bfd = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - size_t filled = 0; - dir_entry_t entries = {0, }; - dir_entry_t *this_entry = NULL; - char *entry_path = NULL; - struct dirent *dirent = NULL; - off_t in_case = 0; - int32_t this_size = 0; - DBC *cursorp = NULL; - int32_t ret = -1; - int32_t real_path_len = 0; - int32_t entry_path_len = 0; - int32_t count = 0; - off_t offset = 0; - size_t tmp_name_len = 0; - struct stat db_stbuf = {0,}; - struct stat buf = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " %o: EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino, size, off, flag); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - op_ret = bdb_cursor_open (bfd->ctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - ": EBADFD " - "(failed to open cursor to database handle)", - fd->inode->ino, size, off); - op_errno = EBADFD; - goto out; - } - - if (off) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - sec.data = &(off); - sec.size = sizeof (off); - sec.flags = DB_DBT_USERMEM; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET); - if (op_ret == DB_NOTFOUND) { - offset = off; - goto dir_read; - } - } - - while (filled <= size) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - - this_entry = NULL; - - sec.flags = DB_DBT_MALLOC; - pri.flags = DB_DBT_MALLOC; - val.dlen = 
0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - /* we reached end of the directory */ - op_ret = 0; - op_errno = 0; - break; - } else if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET - ",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, size, off); - op_errno = ENOENT; - break; - } /* if (op_ret == DB_NOTFOUND)...else if...else */ - - if (pri.data == NULL) { - /* NOTE: currently ignore when we get key.data == NULL. - * FIXME: we should not get key.data = NULL */ - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET - ",%"PRId64":" - "(null key read for entry from database)", - fd->inode->ino, size, off); - continue; - }/* if(key.data)...else */ - - this_entry = GF_CALLOC (1, sizeof (*this_entry), - gf_bdb_mt_dir_entry_t); - if (this_entry == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an entry)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - this_entry->name = GF_CALLOC (pri.size + 1, sizeof (char), - gf_bdb_mt_char); - if (this_entry->name == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an " - "entry->name)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - memcpy (this_entry->name, pri.data, pri.size); - this_entry->buf = db_stbuf; - this_entry->buf.st_size = bdb_db_iread (bfd->ctx, - this_entry->name, NULL); - this_entry->buf.st_blocks = BDB_COUNT_BLOCKS ( - this_entry->buf.st_size, - this_entry->buf.st_blksize); - - this_entry->buf.st_ino = bdb_inode_transform (fd->inode->ino, - pri.data, - pri.size); - count++; - - this_entry->next = entries.next; - this_entry->link = ""; - 
entries.next = this_entry; - /* if size is 0, count can never be = size, - * so entire dir is read */ - if (sec.data) - GF_FREE (sec.data); - - if (pri.data) - GF_FREE (pri.data); - - if (count == size) - break; - }/* while */ - bdb_cursor_close (bfd->ctx, cursorp); - op_ret = count; - op_errno = 0; - if (count >= size) - goto out; -dir_read: - /* hungry kyaa? */ - if (!offset) { - rewinddir (bfd->dir); - } else { - seekdir (bfd->dir, offset); - } - - while (filled <= size) { - this_entry = NULL; - this_size = 0; - - in_case = telldir (bfd->dir); - dirent = readdir (bfd->dir); - if (!dirent) - break; - - if (IS_BDB_PRIVATE_FILE(dirent->d_name)) - continue; - - tmp_name_len = strlen (dirent->d_name); - if (entry_path_len < (real_path_len + 1 + (tmp_name_len) + 1)) { - entry_path_len = real_path_len + tmp_name_len + 1024; - entry_path = realloc (entry_path, entry_path_len); - if (entry_path == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET"," - "%"PRId64" - %s: (failed to allocate " - "memory for an entry_path)", - fd->inode->ino, size, off, - strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - } - - strncpy (&entry_path[real_path_len+1], dirent->d_name, - tmp_name_len); - op_ret = stat (entry_path, &buf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - " (failed to stat on an entry '%s')", - fd->inode->ino, size, off, - strerror (errno), entry_path); - goto out; /* FIXME: shouldn't we continue here */ - } - - if ((flag == GF_GET_DIR_ONLY) && - ((ret != -1) && (!S_ISDIR(buf.st_mode)))) { - continue; - } - - this_entry = GF_CALLOC (1, sizeof (*this_entry), - gf_bdb_mt_dir_entry_t); - if (this_entry == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an entry)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - 
op_ret = -1; - goto out; - } - - this_entry->name = gf_strdup (dirent->d_name); - if (this_entry->name == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET",%"PRId64 - " - %s:" - "(failed to allocate memory for an " - "entry->name)", - fd->inode->ino, size, off, strerror (errno)); - op_errno = ENOMEM; - op_ret = -1; - goto out; - } - - this_entry->buf = buf; - - this_entry->buf.st_ino = -1; - if (S_ISLNK(this_entry->buf.st_mode)) { - char linkpath[PATH_MAX] = {0,}; - ret = readlink (entry_path, linkpath, PATH_MAX); - if (ret != -1) { - linkpath[ret] = '\0'; - this_entry->link = gf_strdup (linkpath); - } - } else { - this_entry->link = ""; - } - - count++; - - this_entry->next = entries.next; - entries.next = this_entry; - - /* if size is 0, count can never be = size, - * so entire dir is read */ - if (count == size) - break; - } - op_ret = filled; - op_errno = 0; - -out: - gf_log (this->name, GF_LOG_DEBUG, - "GETDENTS %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")" - "/%"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, filled, count, size, off); - - STACK_UNWIND (frame, count, op_errno, &entries); - - while (entries.next) { - this_entry = entries.next; - entries.next = entries.next->next; - GF_FREE (this_entry->name); - GF_FREE (this_entry); - } - - return 0; -}/* bdb_getdents */ - - -int32_t -bdb_releasedir (xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - struct bdb_dir *bfd = NULL; - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": EBADFD", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - if (bfd->path) { - GF_FREE (bfd->path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": (bfd->path is NULL)", - fd->inode->ino); - } - - if (bfd->dir) { - closedir (bfd->dir); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": (bfd->dir is 
NULL)", - fd->inode->ino); - } - - if (bfd->ctx) { - bctx_unref (bfd->ctx); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "RELEASEDIR %"PRId64": (bfd->ctx is NULL)", - fd->inode->ino); - } - - GF_FREE (bfd); - -out: - return 0; -}/* bdb_releasedir */ - - -int32_t -bdb_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - char *dest = NULL; - int32_t op_ret = -1; - int32_t op_errno = EPERM; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - dest = alloca (size + 1); - GF_VALIDATE_OR_GOTO (this->name, dest, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = readlink (real_path, dest, size); - - if (op_ret > 0) - dest[op_ret] = 0; - - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "READLINK %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - } -out: - STACK_UNWIND (frame, op_ret, op_errno, dest); - - return 0; -}/* bdb_readlink */ - - -int32_t -bdb_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - int32_t op_ret = -1; - int32_t ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0, }; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_KEY_FROM_PATH (key_string, loc->path); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = mkdir (real_path, mode); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - op_ret = chown (real_path, frame->root->uid, frame->root->gid); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s " - "(failed to do chmod)", - loc->ino, loc->path, strerror (op_errno)); 
- goto err; - } - - op_ret = lstat (real_path, &stbuf); - if (op_ret < 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s " - "(failed to do lstat)", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, key_string, - strlen (key_string)); - - goto out; - -err: - ret = rmdir (real_path); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "MKDIR %"PRId64" (%s): %s" - "(failed to do rmdir)", - loc->ino, loc->path, strerror (errno)); - } - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - - return 0; -}/* bdb_mkdir */ - - -int32_t -bdb_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *real_path = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "UNLINK %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_KEY_FROM_PATH (key_string, loc->path); - op_ret = bdb_db_iremove (bctx, key_string); - if (op_ret == DB_NOTFOUND) { - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = unlink (real_path); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "UNLINK %"PRId64" (%s): %s" - "(symlink unlink failed)", - 
loc->ino, loc->path, strerror (op_errno)); - goto out; - } - } else if (op_ret == 0) { - op_errno = 0; - } -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -}/* bdb_unlink */ - - - -static int32_t -bdb_do_rmdir (xlator_t *this, - loc_t *loc) -{ - char *real_path = NULL; - int32_t ret = -1; - bctx_t *bctx = NULL; - DB_ENV *dbenv = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - dbenv = BDB_ENV(this); - GF_VALIDATE_OR_GOTO (this->name, dbenv, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - ret = -ENOMEM; - goto out; - } - - LOCK(&bctx->lock); - { - if ((bctx->primary == NULL) - || (bctx->secondary == NULL)) { - goto unlock; - } - - ret = bctx->primary->close (bctx->primary, 0); - if (ret < 0) { - ret = -EINVAL; - } - - ret = bctx->secondary->close (bctx->secondary, 0); - if (ret < 0) { - ret = -EINVAL; - } - - ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, - "primary", 0); - if (ret < 0) { - ret = -EBUSY; - } - - ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, - "secondary", 0); - if (ret != 0) { - ret = -EBUSY; - } - } -unlock: - UNLOCK(&bctx->lock); - - if (ret) { - goto out; - } - ret = rmdir (real_path); - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - return ret; -} - -int32_t -bdb_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - - op_ret = is_dir_empty (this, loc); - if (op_ret < 0) { - op_errno = -op_ret; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): %s" - "(internal rmdir routine returned error)", - loc->ino, loc->path, strerror (op_errno)); - } else if (op_ret == 0) { - op_ret = -1; - 
op_errno = ENOTEMPTY; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): ENOTEMPTY", - loc->ino, loc->path); - goto out; - } - - op_ret = bdb_do_rmdir (this, loc); - if (op_ret < 0) { - op_errno = -op_ret; - gf_log (this->name, GF_LOG_DEBUG, - "RMDIR %"PRId64" (%s): %s" - "(internal rmdir routine returned error)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} /* bdb_rmdir */ - -int32_t -bdb_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkname, - loc_t *loc) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0,}; - struct bdb_private *private = NULL; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, linkname, out); - - private = this->private; - GF_VALIDATE_OR_GOTO (this->name, private, out); - - MAKE_KEY_FROM_PATH (key_string, loc->path); - - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = symlink (linkname, real_path); - op_errno = errno; - if (op_ret == 0) { - op_ret = lstat (real_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): %s", - loc->ino, loc->path, strerror (op_errno)); - goto err; - } - - bctx = bctx_parent (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - stbuf.st_mode = private->symlink_mode; - - goto out; - } -err: - op_ret = unlink (real_path); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "SYMLINK %"PRId64" (%s): %s" - "(failed to unlink 
the created symlink)", - loc->ino, loc->path, strerror (op_errno)); - } - op_ret = -1; - op_errno = ENOENT; -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); - - return 0; -} /* bdb_symlink */ - -static int -bdb_do_chmod (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - int32_t ret = -1; - - ret = lchmod (path, stbuf->st_mode); - if ((ret == -1) && (errno == ENOSYS)) { - ret = chmod (path, stbuf->st_mode); - } - - return ret; -} - -static int -bdb_do_chown (xlator_t *this, - const char *path, - struct stat *stbuf, - int32_t valid) -{ - int32_t ret = -1; - uid_t uid = -1; - gid_t gid = -1; - - if (valid & GF_SET_ATTR_UID) - uid = stbuf->st_uid; - - if (valid & GF_SET_ATTR_GID) - gid = stbuf->st_gid; - - ret = lchown (path, uid, gid); - - return ret; -} - -static int -bdb_do_utimes (xlator_t *this, - const char *path, - struct stat *stbuf) -{ - int32_t ret = -1; - struct timeval tv[2] = {{0,},{0,}}; - - tv[0].tv_sec = stbuf->st_atime; - tv[0].tv_usec = ST_ATIM_NSEC (stbuf) / 1000; - tv[1].tv_sec = stbuf->st_mtime; - tv[1].tv_usec = ST_ATIM_NSEC (stbuf) / 1000; - - ret = lutimes (path, tv); - - return ret; -} - -int32_t -bdb_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct stat *stbuf, - int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat preop = {0,}; - struct stat postop = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = lstat (real_path, &preop); - op_errno = errno; - if (op_ret != 0) { - if (op_errno == ENOENT) { - op_errno = EPERM; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "CHMOD %"PRId64" (%s): %s" - "(pre-op lstat failed)", - loc->ino, loc->path, strerror 
(op_errno)); - } - goto out; - } - - /* directory or symlink */ - if (valid & GF_SET_ATTR_MODE) { - op_ret = bdb_do_chmod (this, real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chmod) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ - op_ret = bdb_do_chown (this, real_path, stbuf, valid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (chown) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - op_ret = bdb_do_utimes (this, real_path, stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "setattr (utimes) on %s failed: %s", loc->path, - strerror (op_errno)); - goto out; - } - } - - op_ret = lstat (real_path, &postop); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CHMOD %"PRId64" (%s): %s" - "(post-op lstat failed)", - loc->ino, loc->path, strerror (op_errno)); - } - -out: - STACK_UNWIND (frame, op_ret, op_errno, &preop, &postop); - - return 0; -}/* bdb_setattr */ - -int32_t -bdb_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct stat *stbuf, - int32_t valid) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct stat preop = {0,}; - struct stat postop = {0,}; - - STACK_UNWIND (frame, op_ret, op_errno, &preop, &postop); - - return 0; -}/* bdb_fsetattr */ - - -int32_t -bdb_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct stat stbuf = {0,}; - char *db_path = NULL; - bctx_t *bctx = NULL; - char *key_string = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - bctx = bctx_parent (B_TABLE(this), loc->path); - if 
(bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): ENOMEM" - "(no database handle for parent)", - loc->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - MAKE_KEY_FROM_PATH (key_string, loc->path); - - /* now truncate */ - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): %s" - "(lstat on database file failed)", - loc->ino, loc->path, strerror (op_errno)); - goto out; - } - - if (loc->inode->ino) { - stbuf.st_ino = loc->inode->ino; - }else { - stbuf.st_ino = bdb_inode_transform (loc->parent->ino, - key_string, - strlen (key_string)); - } - - op_ret = bdb_db_itruncate (bctx, key_string); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "TRUNCATE %"PRId64" (%s): EINVAL" - "(truncating entry in database failed - %s)", - loc->ino, loc->path, db_strerror (op_ret)); - op_errno = EINVAL; /* TODO: better errno */ - } - -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - - return 0; -}/* bdb_truncate */ - - -int32_t -bdb_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) - -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - struct statvfs buf = {0, }; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = statvfs (real_path, &buf); - op_errno = errno; -out: - STACK_UNWIND (frame, op_ret, op_errno, &buf); - return 0; -}/* bdb_statfs */ - -static int gf_bdb_xattr_log; - -/* bdb_setxattr - set extended attributes. - * - * bdb allows setxattr operation only on directories. 
- * bdb reservers 'glusterfs.file.<attribute-name>' to operate on the content - * of the files under the specified directory. - * 'glusterfs.file.<attribute-name>' transforms to contents of file of name - * '<attribute-name>' under specified directory. - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @dict: list of extended attributes to set on @loc. - * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if - * it exists) or XATTR_CREATE (create an extended attribute only if it - * doesn't already exist). - * - * - */ -int32_t -bdb_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int flags) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - data_pair_t *trav = dict->members_list; - bctx_t *bctx = NULL; - char *real_path = NULL; - char *key = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - if (!S_ISDIR (loc->inode->st_mode)) { - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - while (trav) { - if (GF_FILE_CONTENT_REQUEST(trav->key) ) { - key = BDB_KEY_FROM_FREQUEST_KEY(trav->key); - - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path, key); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (flags & XATTR_REPLACE) { - op_ret = bdb_db_itruncate (bctx, key); - if (op_ret == -1) { - /* key doesn't exist in database */ - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s:" - " (entry not present in " - "database)", - loc->ino, loc->path, key); - op_ret = -1; - op_errno = ENOATTR; - break; - } - op_ret = bdb_db_iwrite (bctx, key, - trav->value->data, - 
trav->value->len); - if (op_ret != 0) { - op_ret = -1; - op_errno = ENOATTR; - break; - } - } else { - /* fresh create */ - op_ret = bdb_db_iwrite (bctx, key, - trav->value->data, - trav->value->len); - if (op_ret != 0) { - op_ret = -1; - op_errno = EEXIST; - break; - } else { - op_ret = 0; - op_errno = 0; - } /* if(op_ret!=0)...else */ - } /* if(flags&XATTR_REPLACE)...else */ - if (bctx) { - /* NOTE: bctx_unref always returns success, see - * description of bctx_unref for more details */ - bctx_unref (bctx); - } - } else { - /* do plain setxattr */ - op_ret = lsetxattr (real_path, - trav->key, trav->value->data, - trav->value->len, - flags); - op_errno = errno; - - if ((op_errno == ENOATTR) || (op_errno == EEXIST)) { - /* don't log, normal behaviour */ - ; - } else if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, trav->key, - strerror (op_errno)); - /* do not continue, break out */ - break; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "SETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, trav->key, - strerror (op_errno)); - } - } /* if(ZR_FILE_CONTENT_REQUEST())...else */ - trav = trav->next; - }/* while(trav) */ -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_setxattr */ - - -/* bdb_gettxattr - get extended attributes. - * - * bdb allows getxattr operation only on directories. - * bdb_getxattr retrieves the whole content of the file, when - * glusterfs.file.<attribute-name> is specified. - * - * @frame: call frame. - * @this: xlator_t of this instance of bdb xlator. - * @loc: loc_t specifying the file to operate upon. - * @name: name of extended attributes to get for @loc. - * - * NOTE: see description of bdb_setxattr for details on how - * 'glusterfs.file.<attribute-name>' is handles by bdb. 
- */ -int32_t -bdb_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - dict_t *dict = NULL; - bctx_t *bctx = NULL; - char *buf = NULL; - char *key_string = NULL; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char *real_path = NULL; - char key[1024] = {0,}; - char *value = NULL; - char *list = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, name, out); - - dict = dict_new (); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - if (!S_ISDIR (loc->inode->st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR " - "(not a directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - if (name && GF_FILE_CONTENT_REQUEST(name)) { - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOMEM" - "(no database handle for directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - key_string = BDB_KEY_FROM_FREQUEST_KEY(name); - - op_ret = bdb_db_iread (bctx, key_string, &buf); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR" - "(attribute not present in database)", - loc->ino, loc->path, name); - op_errno = ENOATTR; - goto out; - } - - op_ret = dict_set_dynptr (dict, (char *)name, buf, op_ret); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: ENOATTR" - "(attribute present in database, " - "dict set failed)", - loc->ino, loc->path, name); - op_errno = ENODATA; - } - - goto out; - } - - MAKE_REAL_PATH (real_path, this, loc->path); - size = sys_llistxattr (real_path, NULL, 0); - op_errno = errno; - if (size < 0) { - if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) 
{ - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - op_ret = -1; - op_errno = ENOATTR; - - goto out; - } - - if (size == 0) - goto done; - - list = alloca (size + 1); - if (list == NULL) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - - size = sys_llistxattr (real_path, list, size); - op_ret = size; - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - goto out; - } - - remaining_size = size; - list_offset = 0; - while (remaining_size > 0) { - if(*(list+list_offset) == '\0') - break; - - strcpy (key, list + list_offset); - - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) - break; - - value = GF_CALLOC (op_ret + 1, sizeof(char), gf_bdb_mt_char); - GF_VALIDATE_OR_GOTO (this->name, value, out); - - op_ret = sys_lgetxattr (real_path, key, value, - op_ret); - if (op_ret == -1) - break; - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, - value, op_ret); - if (op_ret < 0) { - GF_FREE (value); - gf_log (this->name, GF_LOG_DEBUG, - "GETXATTR %"PRId64" (%s) - %s: " - "(skipping key %s)", - loc->ino, loc->path, name, key); - continue; - } - remaining_size -= strlen (key) + 1; - list_offset += strlen (key) + 1; - } /* while(remaining_size>0) */ -done: -out: - if(bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, dict); - - if (dict) - dict_unref (dict); - - return 0; -}/* bdb_getxattr */ - - -int32_t -bdb_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - 
const char *name) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bctx_t *bctx = NULL; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, name, out); - - if (!S_ISDIR(loc->inode->st_mode)) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR " - "(not a directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - if (GF_FILE_CONTENT_REQUEST(name)) { - bctx = bctx_lookup (B_TABLE(this), loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR" - "(no database handle for directory)", - loc->ino, loc->path, name); - op_ret = -1; - op_errno = ENOATTR; - goto out; - } - - op_ret = bdb_db_iremove (bctx, name); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: ENOATTR" - "(no such attribute in database)", - loc->ino, loc->path, name); - op_errno = ENOATTR; - } - goto out; - } - - MAKE_REAL_PATH(real_path, this, loc->path); - op_ret = lremovexattr (real_path, name); - op_errno = errno; - if (op_ret == -1) { - if (BDB_TIMED_LOG (op_errno, gf_bdb_xattr_log)) { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "REMOVEXATTR %"PRId64" (%s) - %s: %s", - loc->ino, loc->path, name, strerror (op_errno)); - } - } /* if(op_ret == -1) */ -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_removexattr */ - - -int32_t -bdb_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int datasync) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct bdb_fd *bfd = NULL; - - 
GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "FSYNCDIR %"PRId64": EBADFD" - "(failed to find internal context from fd)", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - } - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -}/* bdb_fsycndir */ - - -int32_t -bdb_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - char *real_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = access (real_path, mask); - op_errno = errno; - /* TODO: implement for db entries */ -out: - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -}/* bdb_access */ - - -int32_t -bdb_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - int32_t op_ret = -1; - int32_t op_errno = EPERM; - struct stat buf = {0,}; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - /* TODO: impelement */ -out: - STACK_UNWIND (frame, op_ret, op_errno, &buf); - - return 0; -} - - - -int32_t -bdb_setdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, - dir_entry_t *entries, - int32_t count) -{ - int32_t op_ret = -1, op_errno = EINVAL; - char *entry_path = NULL; - int32_t real_path_len = 0; - int32_t entry_path_len = 0; - int32_t ret = 0; - struct bdb_dir *bfd = NULL; - dir_entry_t *trav = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - GF_VALIDATE_OR_GOTO (this->name, entries, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, 
GF_LOG_DEBUG, - "SETDENTS %"PRId64": EBADFD", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - real_path_len = strlen (bfd->path); - entry_path_len = real_path_len + 256; - entry_path = GF_CALLOC (1, entry_path_len, gf_bdb_mt_char); - GF_VALIDATE_OR_GOTO (this->name, entry_path, out); - - strcpy (entry_path, bfd->path); - entry_path[real_path_len] = '/'; - - trav = entries->next; - while (trav) { - char pathname[PATH_MAX] = {0,}; - strcpy (pathname, entry_path); - strcat (pathname, trav->name); - - if (S_ISDIR(trav->buf.st_mode)) { - /* If the entry is directory, create it by calling - * 'mkdir'. If directory is not present, it will be - * created, if its present, no worries even if it fails. - */ - ret = mkdir (pathname, trav->buf.st_mode); - if ((ret == -1) && (errno != EEXIST)) { - op_errno = errno; - op_ret = ret; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(mkdir failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - - /* Change the mode - * NOTE: setdents tries its best to restore the state - * of storage. 
if chmod and chown fail, they can - * be ignored now */ - ret = chmod (pathname, trav->buf.st_mode); - if (ret < 0) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(chmod failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - /* change the ownership */ - ret = chown (pathname, trav->buf.st_uid, - trav->buf.st_gid); - if (ret != 0) { - op_ret = -1; - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" - %s: %s " - "(chown failed)", - fd->inode->ino, pathname, - strerror (op_errno)); - goto loop; - } - } else if ((flags == GF_SET_IF_NOT_PRESENT) || - (flags != GF_SET_DIR_ONLY)) { - /* Create a 0 byte file here */ - if (S_ISREG (trav->buf.st_mode)) { - op_ret = bdb_db_icreate (bfd->ctx, - trav->name); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" (%s) - %s: " - "%s (database entry creation" - " failed)", - fd->inode->ino, - bfd->ctx->directory, trav->name, - strerror (op_errno)); - } - } else if (S_ISLNK (trav->buf.st_mode)) { - /* TODO: impelement */; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "SETDENTS %"PRId64" (%s) - %s mode=%o: " - "(unsupported file type)", - fd->inode->ino, - bfd->ctx->directory, trav->name, - trav->buf.st_mode); - } /* if(S_ISREG())...else */ - } /* if(S_ISDIR())...else if */ - loop: - /* consider the next entry */ - trav = trav->next; - } /* while(trav) */ - -out: - STACK_UNWIND (frame, op_ret, op_errno); - - GF_FREE (entry_path); - return 0; -} - -int32_t -bdb_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - struct stat stbuf = {0,}; - struct bdb_fd *bfd = NULL; - bctx_t *bctx = NULL; - char *db_path = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - 
"FSTAT %"PRId64": EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - bctx = bfd->ctx; - - MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); - op_ret = lstat (db_path, &stbuf); - op_errno = errno; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "FSTAT %"PRId64": %s" - "(failed to stat database file %s)", - fd->inode->ino, strerror (op_errno), db_path); - goto out; - } - - stbuf.st_ino = fd->inode->ino; - stbuf.st_size = bdb_db_fread (bfd, NULL, 0, 0); - stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); - -out: - STACK_UNWIND (frame, op_ret, op_errno, &stbuf); - return 0; -} - -gf_dirent_t * -gf_dirent_for_namen (const char *name, - size_t len) -{ - char *tmp_name = NULL; - - tmp_name = alloca (len + 1); - - memcpy (tmp_name, name, len); - - tmp_name[len] = 0; - - return gf_dirent_for_name (tmp_name); -} - -int32_t -bdb_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) -{ - struct bdb_dir *bfd = NULL; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - size_t filled = 0; - gf_dirent_t *this_entry = NULL; - gf_dirent_t entries; - struct dirent *entry = NULL; - off_t in_case = 0; - int32_t this_size = 0; - DBC *cursorp = NULL; - int32_t count = 0; - off_t offset = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - INIT_LIST_HEAD (&entries.list); - - BDB_FCTX_GET (fd, this, &bfd); - if (bfd == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD " - "(failed to find internal context in fd)", - fd->inode->ino, size, off); - op_errno = EBADFD; - op_ret = -1; - goto out; - } - - op_ret = bdb_cursor_open (bfd->ctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64": EBADFD " - "(failed to open cursor to database 
handle)", - fd->inode->ino, size, off); - op_errno = EBADFD; - goto out; - } - - if (off) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - sec.data = &(off); - sec.size = sizeof (off); - sec.flags = DB_DBT_USERMEM; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_SET); - if (op_ret == DB_NOTFOUND) { - offset = off; - goto dir_read; - } - } - - while (filled <= size) { - DBT sec = {0,}, pri = {0,}, val = {0,}; - - this_entry = NULL; - - sec.flags = DB_DBT_MALLOC; - pri.flags = DB_DBT_MALLOC; - val.dlen = 0; - val.doff = 0; - val.flags = DB_DBT_PARTIAL; - op_ret = bdb_cursor_get (cursorp, &sec, &pri, &val, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - /* we reached end of the directory */ - op_ret = 0; - op_errno = 0; - break; - } else if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, size, off); - op_errno = ENOENT; - break; - } /* if (op_ret == DB_NOTFOUND)...else if...else */ - - if (pri.data == NULL) { - /* NOTE: currently ignore when we get key.data == NULL. 
- * TODO: we should not get key.data = NULL */ - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET",%"PRId64":" - "(null key read for entry from database)", - fd->inode->ino, size, off); - continue; - }/* if(key.data)...else */ - count++; - this_size = bdb_dirent_size (&pri); - if (this_size + filled > size) - break; - /* TODO - consider endianness here */ - this_entry = gf_dirent_for_namen ((const char *)pri.data, - pri.size); - - this_entry->d_ino = bdb_inode_transform (fd->inode->ino, - pri.data, - pri.size); - this_entry->d_off = *(uint32_t *)sec.data; - this_entry->d_type = 0; - this_entry->d_len = pri.size + 1; - - if (sec.data) { - GF_FREE (sec.data); - } - - if (pri.data) - GF_FREE (pri.data); - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - }/* while */ - bdb_cursor_close (bfd->ctx, cursorp); - op_ret = filled; - op_errno = 0; - if (filled >= size) { - goto out; - } -dir_read: - /* hungry kyaa? */ - if (!offset) { - rewinddir (bfd->dir); - } else { - seekdir (bfd->dir, offset); - } - - while (filled <= size) { - this_entry = NULL; - entry = NULL; - this_size = 0; - - in_case = telldir (bfd->dir); - entry = readdir (bfd->dir); - if (!entry) - break; - - if (IS_BDB_PRIVATE_FILE(entry->d_name)) - continue; - - this_size = dirent_size (entry); - - if (this_size + filled > size) { - seekdir (bfd->dir, in_case); - break; - } - - count++; - - this_entry = gf_dirent_for_name (entry->d_name); - this_entry->d_ino = entry->d_ino; - - this_entry->d_off = entry->d_off; - - this_entry->d_type = entry->d_type; - this_entry->d_len = entry->d_reclen; - - - list_add_tail (&this_entry->list, &entries.list); - - filled += this_size; - } - op_ret = filled; - op_errno = 0; - -out: - gf_log (this->name, GF_LOG_DEBUG, - "READDIR %"PRId64" - %"GF_PRI_SIZET" (%"PRId32")" - "/%"GF_PRI_SIZET",%"PRId64":" - "(failed to read the next entry from database)", - fd->inode->ino, filled, count, size, off); - - STACK_UNWIND (frame, count, 
op_errno, &entries); - - gf_dirent_free (&entries); - - return 0; -} - - -int32_t -bdb_stats (call_frame_t *frame, - xlator_t *this, - int32_t flags) - -{ - int32_t op_ret = 0; - int32_t op_errno = 0; - - struct xlator_stats xlstats = {0, }, *stats = NULL; - struct statvfs buf = {0,}; - struct timeval tv; - struct bdb_private *private = NULL; - int64_t avg_read = 0; - int64_t avg_write = 0; - int64_t _time_ms = 0; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - - private = (struct bdb_private *)(this->private); - stats = &xlstats; - - op_ret = statvfs (private->export_path, &buf); - if (op_ret != 0) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "STATS %s: %s", - private->export_path, strerror (op_errno)); - goto out; - } - - stats->nr_files = private->stats.nr_files; - - /* client info is maintained at FSd */ - stats->nr_clients = private->stats.nr_clients; - - /* Number of Free block in the filesystem. */ - stats->free_disk = buf.f_bfree * buf.f_bsize; - stats->total_disk_size = buf.f_blocks * buf.f_bsize; /* */ - stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; - - /* Calculate read and write usage */ - gettimeofday (&tv, NULL); - - /* Read */ - _time_ms = (tv.tv_sec - private->init_time.tv_sec) * 1000 + - ((tv.tv_usec - private->init_time.tv_usec) / 1000); - - avg_read = (_time_ms) ? (private->read_value / _time_ms) : 0;/* KBps */ - avg_write = (_time_ms) ? 
(private->write_value / _time_ms) : 0; - - _time_ms = (tv.tv_sec - private->prev_fetch_time.tv_sec) * 1000 + - ((tv.tv_usec - private->prev_fetch_time.tv_usec) / 1000); - if (_time_ms - && ((private->interval_read / _time_ms) > private->max_read)) { - private->max_read = (private->interval_read / _time_ms); - } - if (_time_ms - && ((private->interval_write / _time_ms) > private->max_write)) { - private->max_write = private->interval_write / _time_ms; - } - - stats->read_usage = avg_read / private->max_read; - stats->write_usage = avg_write / private->max_write; - - gettimeofday (&(private->prev_fetch_time), NULL); - private->interval_read = 0; - private->interval_write = 0; - -out: - STACK_UNWIND (frame, op_ret, op_errno, stats); - return 0; -} - - -int32_t -bdb_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. 
please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - - -int32_t -bdb_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - gf_log (this->name, GF_LOG_ERROR, - "glusterfs internal locking request. please load " - "'features/locks' translator to enable glusterfs " - "support"); - - STACK_UNWIND (frame, -1, ENOSYS); - return 0; -} - -int32_t -bdb_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - char *real_path = NULL; - DIR *dir = NULL; - struct dirent *dirent = NULL; - uint8_t file_checksum[NAME_MAX] = {0,}; - uint8_t dir_checksum[NAME_MAX] = {0,}; - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - int32_t idx = 0, length = 0; - bctx_t *bctx = NULL; - DBC *cursorp = NULL; - char *data = NULL; - uint8_t no_break = 1; - - GF_VALIDATE_OR_GOTO ("bdb", frame, out); - GF_VALIDATE_OR_GOTO ("bdb", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - - MAKE_REAL_PATH (real_path, this, loc->path); - - { - dir = opendir (real_path); - op_errno = errno; - GF_VALIDATE_OR_GOTO (this->name, dir, out); - while ((dirent = readdir (dir))) { - if (!dirent) - break; - - if (IS_BDB_PRIVATE_FILE(dirent->d_name)) - continue; - - length = strlen (dirent->d_name); - for (idx = 0; idx < length; idx++) - dir_checksum[idx] ^= dirent->d_name[idx]; - } /* while((dirent...)) */ - closedir (dir); - } - - { - bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); - if (bctx == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s): ENOMEM" - "(failed to lookup database handle)", - loc->inode->ino, loc->path); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - op_ret = bdb_cursor_open (bctx, &cursorp); - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s): EBADFD" - "(failed to open cursor to database handle)", - loc->inode->ino, 
loc->path); - op_ret = -1; - op_errno = EBADFD; - goto out; - } - - - do { - DBT key = {0,}, value = {0,}, sec = {0,}; - - key.flags = DB_DBT_MALLOC; - value.doff = 0; - value.dlen = 0; - op_ret = bdb_cursor_get (cursorp, &sec, &key, - &value, DB_NEXT); - - if (op_ret == DB_NOTFOUND) { - op_ret = 0; - op_errno = 0; - no_break = 0; - } else if (op_ret == 0){ - /* successfully read */ - data = key.data; - length = key.size; - for (idx = 0; idx < length; idx++) - file_checksum[idx] ^= data[idx]; - - GF_FREE (key.data); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "CHECKSUM %"PRId64" (%s)", - loc->inode->ino, loc->path); - op_ret = -1; - op_errno = ENOENT; /* TODO: watch errno */ - no_break = 0; - }/* if(op_ret == DB_NOTFOUND)...else if...else */ - } while (no_break); - bdb_cursor_close (bctx, cursorp); - } -out: - if (bctx) { - /* NOTE: bctx_unref always returns success, - * see description of bctx_unref for more details */ - bctx_unref (bctx); - } - - STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); - - return 0; -} - -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that bdb xlator is up */ - GF_ASSERT ((this->private != NULL) && - (BDB_ENV(this) != NULL)); - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - default: - /* */ - break; - } - return 0; -} - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_bdb_mt_end + 1); - - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -/** - * init - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = -1; - struct stat buf = {0,}; - struct bdb_private *_private = NULL; - char *directory = NULL; - bctx_t *bctx = NULL; - - GF_VALIDATE_OR_GOTO ("bdb", this, out); - - if (this->children) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' translator should be used as leaf node " - "in translator tree. please remove the subvolumes" - " specified and retry."); - goto err; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' translator needs at least one among " - "'protocol/server' or 'mount/fuse' translator as " - "parent. please add 'protocol/server' or 'mount/fuse' " - "as parent of 'storage/bdb' and retry. or you can also" - " try specifying mount-point on command-line."); - goto err; - } - - _private = GF_CALLOC (1, sizeof (*_private), gf_bdb_mt_bdb_private); - if (_private == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for 'storage/bdb' " - "configuration data-structure. cannot continue from " - "here"); - goto err; - } - - - ret = dict_get_str (this->options, "directory", &directory); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "'storage/bdb' needs at least " - "'option directory <path-to-export-directory>' as " - "minimal configuration option. 
please specify an " - "export directory using " - "'option directory <path-to-export-directory>' and " - "retry."); - goto err; - } - - umask (000); /* umask `masking' is done at the client side */ - - /* Check whether the specified directory exists, if not create it. */ - ret = stat (directory, &buf); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "specified export path '%s' does not exist. " - "please create the export path '%s' and retry.", - directory, directory); - goto err; - } else if (!S_ISDIR (buf.st_mode)) { - gf_log (this->name, GF_LOG_ERROR, - "specified export path '%s' is not a directory. " - "please specify a valid and existing directory as " - "export directory and retry.", - directory); - goto err; - } else { - ret = 0; - } - - - _private->export_path = gf_strdup (directory); - if (_private->export_path == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for 'storage/bdb' " - "configuration data-structure. cannot continue from " - "here"); - goto err; - } - - _private->export_path_length = strlen (_private->export_path); - - { - /* Stats related variables */ - gettimeofday (&_private->init_time, NULL); - gettimeofday (&_private->prev_fetch_time, NULL); - _private->max_read = 1; - _private->max_write = 1; - } - - this->private = (void *)_private; - - { - ret = bdb_db_init (this, this->options); - - if (ret < 0){ - gf_log (this->name, GF_LOG_ERROR, - "database environment initialisation failed. " - "manually run database recovery tool and " - "retry to run glusterfs"); - goto err; - } else { - bctx = bctx_lookup (_private->b_table, "/"); - /* NOTE: we are not doing bctx_unref() for root bctx, - * let it remain in active list forever */ - if (bctx == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "could not allocate memory for " - "'storage/bdb' configuration data-" - "structure. 
cannot continue from " - "here"); - goto err; - } else { - ret = 0; - goto out; - } - } - } -err: - if (_private) { - if (_private->export_path) - GF_FREE (_private->export_path); - - GF_FREE (_private); - } -out: - return ret; -} - -void -bctx_cleanup (struct list_head *head) -{ - bctx_t *trav = NULL; - bctx_t *tmp = NULL; - DB *storage = NULL; - DB *secondary = NULL; - - list_for_each_entry_safe (trav, tmp, head, list) { - LOCK (&trav->lock); - { - storage = trav->primary; - trav->primary = NULL; - - secondary = trav->secondary; - trav->secondary = NULL; - - list_del_init (&trav->list); - } - UNLOCK (&trav->lock); - - if (storage) { - storage->close (storage, 0); - storage = NULL; - } - - if (secondary) { - secondary->close (secondary, 0); - secondary = NULL; - } - } - return; -} - -void -fini (xlator_t *this) -{ - struct bdb_private *private = NULL; - int32_t ret = 0; - - private = this->private; - - if (B_TABLE(this)) { - /* close all the dbs from lru list */ - bctx_cleanup (&(B_TABLE(this)->b_lru)); - bctx_cleanup (&(B_TABLE(this)->active)); - - if (BDB_ENV(this)) { - LOCK (&private->active_lock); - { - private->active = 0; - } - UNLOCK (&private->active_lock); - - ret = pthread_join (private->checkpoint_thread, NULL); - if (ret != 0) { - gf_log (this->name, GF_LOG_CRITICAL, - "could not complete checkpointing " - "database environment. 
this might " - "result in inconsistencies in few" - " recent data and meta-data " - "operations"); - } - - BDB_ENV(this)->close (BDB_ENV(this), 0); - } else { - /* impossible to reach here */ - } - - GF_FREE (B_TABLE(this)); - } - GF_FREE (private); - return; -} - - -struct xlator_fops fops = { - .lookup = bdb_lookup, - .stat = bdb_stat, - .opendir = bdb_opendir, - .readdir = bdb_readdir, - .readlink = bdb_readlink, - .mknod = bdb_mknod, - .mkdir = bdb_mkdir, - .unlink = bdb_unlink, - .rmdir = bdb_rmdir, - .symlink = bdb_symlink, - .rename = bdb_rename, - .link = bdb_link, - .truncate = bdb_truncate, - .create = bdb_create, - .open = bdb_open, - .readv = bdb_readv, - .writev = bdb_writev, - .statfs = bdb_statfs, - .flush = bdb_flush, - .fsync = bdb_fsync, - .setxattr = bdb_setxattr, - .getxattr = bdb_getxattr, - .removexattr = bdb_removexattr, - .fsyncdir = bdb_fsyncdir, - .access = bdb_access, - .ftruncate = bdb_ftruncate, - .fstat = bdb_fstat, - .lk = bdb_lk, - .inodelk = bdb_inodelk, - .finodelk = bdb_finodelk, - .entrylk = bdb_entrylk, - .fentrylk = bdb_fentrylk, - .setdents = bdb_setdents, - .getdents = bdb_getdents, - .checksum = bdb_checksum, - .setattr = bdb_setattr, - .fsetattr = bdb_fsetattr, -}; - -struct xlator_cbks cbks = { - .release = bdb_release, - .releasedir = bdb_releasedir -}; - - -struct volume_options options[] = { - { .key = { "directory" }, - .type = GF_OPTION_TYPE_PATH, - .description = "export directory" - }, - { .key = { "logdir" }, - .type = GF_OPTION_TYPE_PATH, - .description = "directory to be used by libdb for writing" - "transaction logs. NOTE: in absence of 'logdir' " - "export directory itself will be used as 'logdir' also" - }, - { .key = { "errfile" }, - .type = GF_OPTION_TYPE_PATH, - .description = "path to be used for libdb error logging. " - "NOTE: absence of 'errfile' will disable any " - "error logging by libdb." 
- }, - { .key = { "dir-mode" }, - .type = GF_OPTION_TYPE_ANY /* base 8 number */ - }, - { .key = { "file-mode" }, - .type = GF_OPTION_TYPE_ANY, - .description = "file mode for regular files. stat() on a regular file" - " returns the mode specified by this option. " - "NOTE: specify value in octal" - }, - { .key = { "page-size" }, - .type = GF_OPTION_TYPE_SIZET, - .min = 512, - .max = 16384, - .description = "size of pages used to hold data by libdb. set it to " - "block size of exported filesystem for " - "optimal performance" - }, - { .key = { "open-db-lru-limit" }, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 2048, - .description = "maximum number of per directory databases that can " - "be kept open. NOTE: for _advanced_ users only." - }, - { .key = { "lock-timeout" }, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 4260000, - .description = "define the maximum time a lock request can " - "be blocked by libdb. NOTE: only for _advanced_ users." - " do not specify this option when not sure." - }, - { .key = { "checkpoint-interval" }, - .type = GF_OPTION_TYPE_TIME, - .min = 1, - .max = 86400, - .description = "define the time interval between two consecutive " - "libdb checpoints. setting to lower value will leave " - "bdb perform slowly, but guarantees that minimum data" - " will be lost in case of a crash. NOTE: this option " - "is valid only when " - "'option mode=\"persistent\"' is set." - }, - { .key = { "transaction-timeout" }, - .type = GF_OPTION_TYPE_TIME, - .min = 0, - .max = 4260000, - .description = "maximum time for which a transaction can block " - "waiting for required resources." - }, - { .key = { "mode" }, - .type = GF_OPTION_TYPE_BOOL, - .value = { "cache", "persistent" }, - .description = "cache: data recovery is not guaranteed in case " - "of crash. persistent: data recovery is guaranteed, " - "since all operations are transaction protected." 
- }, - { .key = { "access-mode" }, - .type = GF_OPTION_TYPE_STR, - .value = {"btree", "hash" }, - .description = "chose the db access method. " - "NOTE: for _advanced_ users. leave the choice to " - "glusterfs when in doubt." - }, - { .key = { NULL } } -}; diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h deleted file mode 100644 index da8937a02..000000000 --- a/xlators/storage/bdb/src/bdb.h +++ /dev/null @@ -1,530 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _BDB_H -#define _BDB_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdio.h> -#include <dirent.h> -#include <unistd.h> -#include <sys/types.h> -#include <dirent.h> - -#include <db.h> - -#ifdef linux -#ifdef __GLIBC__ -#include <sys/fsuid.h> -#else -#include <unistd.h> -#endif -#endif - -#ifdef HAVE_SYS_XATTR_H -#include <sys/xattr.h> -#endif - -#ifdef HAVE_SYS_EXTATTR_H -#include <sys/extattr.h> -#endif - -#include <pthread.h> -#include "xlator.h" -#include "inode.h" -#include "compat.h" -#include "compat-errno.h" -#include "fd.h" -#include "syscall.h" - -#define BDB_STORAGE "/glusterfs_storage.db" - -/* numbers are not so reader-friendly, so lets have ON and OFF macros */ -#define ON 1 -#define OFF 0 - -#define BDB_DEFAULT_LRU_LIMIT 100 -#define BDB_DEFAULT_HASH_SIZE 100 - -#define BDB_ENOSPC_THRESHOLD 25600 - -#define BDB_DEFAULT_CHECKPOINT_INTERVAL 30 - -#define BCTX_ENV(bctx) (bctx->table->dbenv) - -#define BDB_EXPORT_PATH_LEN(_private) \ - (((struct bdb_private *)_private)->export_path_length) - -#define BDB_KEY_FROM_FREQUEST_KEY(_key) (&(key[15])) - -#define BDB_EXPORT_PATH(_private) \ - (((struct bdb_private *)_private)->export_path) -/* MAKE_REAL_PATH(var,this,path) - * make the real path on the underlying file-system - * - * @var: destination to hold the real path - * @this: pointer to xlator_t corresponding to bdb xlator - * @path: path, as seen from mount-point - */ -#define MAKE_REAL_PATH(var, this, path) do { \ - int base_len = BDB_EXPORT_PATH_LEN(this->private); \ - var = alloca (strlen (path) + base_len + 2); \ - strcpy (var, BDB_EXPORT_PATH(this->private)); \ - strcpy (&var[base_len], path); \ - } while (0) - - -#define BDB_TIMED_LOG(_errno,_counter) \ - ((_errno == ENOTSUP) && (((++_counter) % GF_UNIVERSAL_ANSWER) == 1)) - -#define GF_FILE_CONTENT_REQUEST ZR_FILE_CONTENT_REQUEST - -/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path) - * make the real path to the storage-database file on file-system - * 
- * @var: destination to hold the real path - * @this: pointer to xlator_t corresponding to bdb xlator - * @path: path of the directory, as seen from mount-point - */ -#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do { \ - int base_len = BDB_EXPORT_PATH_LEN(this->private); \ - var = alloca (strlen (path) + \ - base_len + \ - strlen (BDB_STORAGE)); \ - strcpy (var, BDB_EXPORT_PATH(this->private)); \ - strcpy (&var[base_len], path); \ - strcat (var, BDB_STORAGE); \ - } while (0) - -/* MAKE_KEY_FROM_PATH(key,path) - * make a 'key', which we use as key in the underlying database by using - * the path - * - * @key: destination to hold the key - * @path: path to file as seen from mount-point - */ -#define MAKE_KEY_FROM_PATH(key, path) do { \ - char *tmp = alloca (strlen (path)); \ - strcpy (tmp, path); \ - key = basename (tmp); \ - }while (0); - -/* IS_BDB_PRIVATE_FILE(name) - * check if a given 'name' is bdb xlator's internal file name - * - * @name: basename of a file. - * - * bdb xlator reserves file names 'glusterfs_storage.db', - * 'glusterfs_ns.db'(used by bdb xlator itself), 'log.*', '__db.*' - * (used by libdb) - */ -#define IS_BDB_PRIVATE_FILE(name) ((!strncmp(name, "__db.", 5)) || \ - (!strcmp(name, "glusterfs_storage.db")) || \ - (!strcmp(name, "glusterfs_ns.db")) || \ - (!strncmp(name, "log.0000", 8))) - -/* check if 'name' is '.' or '..' entry */ -#define IS_DOT_DOTDOT(name) \ - ((!strncmp(name,".", 1)) || (!strncmp(name,"..", 2))) - -/* BDB_ICTX_SET(this,inode,bctx) - * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories. - * this will happen either in lookup() or mkdir(). - * - * @this: pointer xlator_t of bdb xlator. - * @inode: inode where 'struct bdb_ctx *' has to be stored. 
- * @bctx: a 'struct bdb_ctx *' - */ -#define BDB_ICTX_SET(_inode,_this,_bctx) do{ \ - inode_ctx_put(_inode, _this, (uint64_t)(long)_bctx); \ - }while (0); - -#define BDB_ICTX_GET(_inode,_this,_bctxp) do { \ - uint64_t tmp_bctx = 0; \ - inode_ctx_get (_inode, _this, &tmp_bctx); \ - *_bctxp = tmp_bctx; \ - }while (0); - -/* BDB_FCTX_SET(this,fd,bctx) - * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories. - * this will happen either in lookup() or mkdir(). - * - * @this: pointer xlator_t of bdb xlator. - * @inode: inode where 'struct bdb_ctx *' has to be stored. - * @bctx: a 'struct bdb_ctx *' - */ -#define BDB_FCTX_SET(_fd,_this,_bfd) do{ \ - fd_ctx_set(_fd, _this, (uint64_t)(long)_bfd); \ - }while (0); - -#define BDB_FCTX_GET(_fd,_this,_bfdp) do { \ - uint64_t tmp_bfd = 0; \ - fd_ctx_get (_fd, _this, &tmp_bfd); \ - *_bfdp = (void *)(long)tmp_bfd; \ - }while (0); - - -/* maximum number of open dbs that bdb xlator will ever have */ -#define BDB_MAX_OPEN_DBS 100 - -/* convert file size to block-count */ -#define BDB_COUNT_BLOCKS(size,blksize) (((size + blksize - 1)/blksize) - 1) - -/* file permissions, again macros are more readable */ -#define RWXRWXRWX 0777 -#define DEFAULT_FILE_MODE 0600 -#define DEFAULT_DIR_MODE 0755 - -/* see, if have a valid file permissions specification in @mode */ -#define IS_VALID_FILE_MODE(mode) (!(mode & (~RWXRWXRWX))) -#define IS_VALID_DIR_MODE(mode) (!(mode & (~(RWXRWXRWX))) - -/* maximum retries for a failed transactional operation */ -#define BDB_MAX_RETRIES 10 - -#define BDB_LL_PAGE_SIZE_DEFAULT 4096 -#define BDB_LL_PAGE_SIZE_MIN 4096 -#define BDB_LL_PAGE_SIZE_MAX 65536 - -#define PAGE_SIZE_IN_RANGE(_page_size) \ - ((_page_size >= BDB_LL_PAGE_SIZE_MIN) \ - && (table->page_size <= BDB_LL_PAGE_SIZE_MAX)) - -typedef struct bctx_table bctx_table_t; -typedef struct bdb_ctx bctx_t; -typedef struct bdb_cache bdb_cache_t; -typedef struct bdb_private bdb_private_t; - -struct bctx_table { - /* flags to be used for opening 
each database */ - uint64_t dbflags; - - /* cache: can be either ON or OFF */ - uint64_t cache; - - /* used to lock the 'struct bctx_table *' */ - gf_lock_t lock; - - /* lock for checkpointing */ - gf_lock_t checkpoint_lock; - - /* hash table of 'struct bdb_ctx' */ - struct list_head *b_hash; - - /* list of active 'struct bdb_ctx' */ - struct list_head active; - - /* lru list of inactive 'struct bdb_ctx' */ - struct list_head b_lru; - struct list_head purge; - uint32_t lru_limit; - uint32_t lru_size; - uint32_t hash_size; - - /* access mode for accessing the databases, can be DB_HASH, DB_BTREE */ - DBTYPE access_mode; - - /* DB_ENV under which every db operation is carried over */ - DB_ENV *dbenv; - int32_t transaction; - xlator_t *this; - - /* page-size of DB, DB->set_pagesize(), should be set before DB->open */ - uint64_t page_size; -}; - -struct bdb_ctx { - /* controller members */ - - /* lru list of 'struct bdb_ctx's, a bdb_ctx can exist in one of - * b_hash or lru lists */ - struct list_head list; - - /* directory 'name' hashed list of 'struct bdb_ctx's */ - struct list_head b_hash; - - struct bctx_table *table; - int32_t ref; /* reference count */ - gf_lock_t lock; /* used to lock this 'struct bdb_ctx' */ - - char *directory; /* directory path */ - - /* pointer to open database, that resides inside this directory */ - DB *primary; - DB *secondary; - uint32_t cache; /* cache ON or OFF */ - - /* per directory cache, bdb xlator's internal cache */ - struct list_head c_list; /* linked list of cached records */ - int32_t c_count; /* number of cached records */ - - /* index to hash table list, to which this ctx belongs */ - int32_t key_hash; - char *db_path; /* absolute path to db file */ -}; - -struct bdb_fd { - /* pointer to bdb_ctx of the parent directory */ - struct bdb_ctx *ctx; - - /* name of the file. 
NOTE: basename, not the complete path */ - char *key; - int32_t flags; /* open flags */ -}; - -struct bdb_dir { - /* pointer to bdb_ctx of this directory */ - struct bdb_ctx *ctx; - - /* open directory pointer, as returned by opendir() */ - DIR *dir; - - char *path; /* path to this directory */ -}; - -/* cache */ -struct bdb_cache { - /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */ - struct list_head c_list; - - /* name of the file this cache holds. NOTE: basename of file */ - char *key; - char *data; /* file content */ - - /* size of the file content that this cache holds */ - size_t size; -}; - - -struct bdb_private { - /* pointer to inode table that we use */ - inode_table_t *itable; - int32_t temp; /**/ - char is_stateless; /**/ - - /* path to the export directory - * (option directory <export-path>) */ - char *export_path; - - /* length of 'export_path' string */ - int32_t export_path_length; - - /* statistics */ - /* Statistics, provides activity of the server */ - struct xlator_stats stats; - - struct timeval prev_fetch_time; - struct timeval init_time; - int32_t max_read; /* */ - int32_t max_write; /* */ - - /* Used to calculate the max_read value */ - int64_t interval_read; - - /* Used to calculate the max_write value */ - int64_t interval_write; - int64_t read_value; /* Total read, from init */ - int64_t write_value; /* Total write, from init */ - - /* bdb xlator specific private data */ - - /* flags used for opening DB_ENV for this xlator */ - uint64_t envflags; - - /* flags to be used for opening each database */ - uint64_t dbflags; - - /* cache: can be either ON or OFF */ - uint64_t cache; - - /* transaction: can be either ON or OFF */ - uint32_t transaction; - uint32_t active; - gf_lock_t active_lock; - struct bctx_table *b_table; - - /* access mode for accessing the databases, can be DB_HASH, DB_BTREE - * (option access-mode <mode>) */ - DBTYPE access_mode; - - /* mode for each and every file stored on bdb - * (option file-mode <mode>) */ - 
mode_t file_mode; - - /* mode for each and every directory stored on bdb - * (option dir-mode <mode>) */ - mode_t dir_mode; - - /* mode for each and every symlink stored on bdb */ - mode_t symlink_mode; - - /* pthread_t object used for creating checkpoint thread */ - pthread_t checkpoint_thread; - - /* time duration between two consecutive checkpoint operations. - * (option checkpoint-interval <time-in-seconds>) */ - uint32_t checkpoint_interval; - - /* environment log directory (option logdir <directory>) */ - char *logdir; - - /* errfile path, used by environment to print detailed error log. - * (option errfile <errfile-path>) */ - char *errfile; - - /* DB_ENV->set_errfile() expects us to fopen - * the errfile before doing DB_ENV->set_errfile() */ - FILE *errfp; - - /* used by DB_ENV->set_timeout to set the timeout for - * a transactionally encapsulated DB->operation() to - * timeout before waiting for locks to be released. - * (option transaction-timeout <time-in-milliseconds>) - */ - uint32_t txn_timeout; - uint32_t lock_timeout; - - /* DB_AUTO_LOG_REMOVE flag for DB_ENV*/ - uint32_t log_auto_remove; - uint32_t log_region_max; -}; - - -static inline int32_t -bdb_txn_begin (DB_ENV *dbenv, - DB_TXN **ptxnid) -{ - return dbenv->txn_begin (dbenv, NULL, ptxnid, 0); -} - -static inline int32_t -bdb_txn_abort (DB_TXN *txnid) -{ - return txnid->abort (txnid); -} - -static inline int32_t -bdb_txn_commit (DB_TXN *txnid) -{ - return txnid->commit (txnid, 0); -} - -void * -bdb_db_stat (bctx_t *bctx, - DB_TXN *txnid, - uint32_t flags); - -/*int32_t -bdb_db_get(struct bdb_ctx *bctx, - DB_TXN *txnid, - const char *key_string, - char **buf, - size_t size, - off_t offset); -*/ -int32_t -bdb_db_fread (struct bdb_fd *bfd, char *bufp, size_t size, off_t offset); - -int32_t -bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp); - -#define BDB_TRUNCATE_RECORD 0xcafebabe - -/*int32_t -bdb_db_put (struct bdb_ctx *bctx, - DB_TXN *txnid, - const char *key_string, - const 
char *buf, - size_t size, - off_t offset, - int32_t flags); -*/ -int32_t -bdb_db_icreate (struct bdb_ctx *bctx, const char *key); - -int32_t -bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset); - -int32_t -bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size); - -int32_t -bdb_db_itruncate (struct bdb_ctx *bctx, const char *key); - -int32_t -bdb_db_iremove (struct bdb_ctx *bctx, - const char *key); - -ino_t -bdb_inode_transform (ino_t parent, - const char *name, - size_t namelen); - -int32_t -bdb_cursor_open (struct bdb_ctx *bctx, - DBC **cursorp); - -int32_t -bdb_cursor_get (DBC *cursorp, - DBT *sec, DBT *pri, - DBT *value, - int32_t flags); - - -int32_t -bdb_cursor_close (struct bdb_ctx *ctx, - DBC *cursorp); - - -int32_t -bdb_dirent_size (DBT *key); - -int32_t -dirent_size (struct dirent *entry); - -int -bdb_db_init (xlator_t *this, - dict_t *options); - -void -bdb_dbs_from_dict_close (dict_t *this, - char *key, - data_t *value, - void *data); - -bctx_t * -bctx_lookup (struct bctx_table *table, - const char *path); - -bctx_t * -bctx_parent -(struct bctx_table *table, - const char *path); - -bctx_t * -bctx_unref (bctx_t *ctx); - -bctx_t * -bctx_ref (bctx_t *ctx); - -#endif /* _BDB_H */ diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 408dcb80d..88efcc784 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -2,17 +2,18 @@ xlator_LTLIBRARIES = posix.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module -avoid-version -posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c -posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) -noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h 
+noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h -AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ - -D$(GF_HOST_OS) -Wall -I$(top_srcdir)/libglusterfs/src -shared \ - -nostartfiles -I$(top_srcdir)/rpc/xdr/src \ - -I$(top_srcdir)/rpc/rpc-lib/src $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 000000000..c3bbddd67 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,569 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "posix.h" +#include <sys/uio.h> + +#ifdef HAVE_LIBAIO +#include <libaio.h> + + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = pfd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags & (~O_DIRECT))); + pfd->odirect = 0; + } + + if (odirect && !pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags | O_DIRECT)); + pfd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d", + strerror (errno), pfd->fd, flags, pfd->odirect); + } +} + + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int fd; + int op; + off_t offset; +}; + + +int +posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + _fd = paiocb->fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)", + _fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) 
paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + 
paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%d,offset=%llu (%d/%s)", + _fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + + op_ret = res; + op_errno = 0; + + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); 
+ GF_FREE (paiocb); + } + + return 0; +} + + +int +posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat (this, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto err; + } + + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +void * +posix_aio_thread (void 
*data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + + +int +posix_aio_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. 
ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + + +int +posix_aio_on (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off (xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + + +#else + + +int +posix_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 000000000..5bde71601 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + + +int posix_aio_on (xlator_t *this); +int posix_aio_off (xlator_t *this); + +int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index 38b28edc3..219a582c9 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -34,9 +24,11 @@ #include "posix-handle.h" #include "posix.h" #include "xlator.h" +#include "syscall.h" #define HANDLE_PFX ".glusterfs" +#define TRASH_DIR "landfill" #define UUID0_STR "00000000-0000-0000-0000-000000000000" #define SLEN(str) (sizeof(str) - 1) @@ -106,7 +98,7 @@ posix_handle_pump (xlator_t *this, char *buf, int len, int maxlen, if ((ret == 8) && memcmp (linkname, "../../..", 8) == 0) { if (strcmp (base_str, buf) == 0) { - strncpy (buf + pfx_len, "..", 3); + strcpy (buf + pfx_len, ".."); } goto out; } @@ -393,6 +385,107 @@ posix_handle_init (xlator_t *this) return 0; } +gf_boolean_t +posix_does_old_trash_exists (char *old_trash) +{ + uuid_t gfid = {0}; + gf_boolean_t exists = _gf_false; + struct stat stbuf = {0}; + int ret = 0; + + ret = lstat (old_trash, &stbuf); + if ((ret == 0) && S_ISDIR (stbuf.st_mode)) { + ret = sys_lgetxattr (old_trash, "trusted.gfid", gfid, 16); + if ((ret < 0) && (errno == ENODATA)) + exists = _gf_true; + } + return exists; +} + +int +posix_handle_new_trash_init (xlator_t *this, char *trash) +{ + int ret = 0; + struct stat stbuf = {0}; + + ret = lstat (trash, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (trash, 0755); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + trash, strerror (errno)); + } + } else { + gf_log (this->name, GF_LOG_ERROR, "Checking for %s " + "failed: %s", trash, strerror 
(errno)); + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", trash); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +posix_mv_old_trash_into_new_trash (xlator_t *this, char *old, char *new) +{ + char dest_old[PATH_MAX] = {0}; + int ret = 0; + uuid_t dest_name = {0}; + + if (!posix_does_old_trash_exists (old)) + goto out; + uuid_generate (dest_name); + snprintf (dest_old, sizeof (dest_old), "%s/%s", new, + uuid_utoa (dest_name)); + ret = rename (old, dest_old); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Not able to move " + "%s -> %s (%s)", old, dest_old, strerror (errno)); + } +out: + return ret; +} + +int +posix_handle_trash_init (xlator_t *this) +{ + int ret = -1; + struct posix_private *priv = NULL; + char old_trash[PATH_MAX] = {0}; + + priv = this->private; + + priv->trash_path = GF_CALLOC (1, priv->base_path_length + strlen ("/") + + strlen (HANDLE_PFX) + strlen ("/") + + strlen (TRASH_DIR) + 1, + gf_posix_mt_trash_path); + + if (!priv->trash_path) + goto out; + + strncpy (priv->trash_path, priv->base_path, priv->base_path_length); + strcat (priv->trash_path, "/" HANDLE_PFX "/" TRASH_DIR); + ret = posix_handle_new_trash_init (this, priv->trash_path); + if (ret) + goto out; + snprintf (old_trash, sizeof (old_trash), "%s/.landfill", + priv->base_path); + ret = posix_mv_old_trash_into_new_trash (this, old_trash, + priv->trash_path); +out: + return ret; +} int posix_handle_mkdir_hashes (xlator_t *this, const char *newpath) @@ -454,7 +547,16 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat return -1; } +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. 
See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); +#else ret = link (oldpath, newpath); +#endif if (ret) { gf_log (this->name, GF_LOG_WARNING, "link %s -> %s failed (%s)", @@ -471,13 +573,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat } } - ret = lstat (newpath, &newbuf); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "lstat on %s failed (%s)", newpath, strerror (errno)); - return -1; - } - if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { gf_log (this->name, GF_LOG_WARNING, @@ -633,7 +728,16 @@ posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, MAKE_HANDLE_PATH (newpath, this, gfid, NULL); ret = lstat (newpath, &stbuf); if (!ret) { +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, newpath, AT_FDCWD, real_path, 0); +#else ret = link (newpath, real_path); +#endif } return ret; diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index ec4baec5b..f1163b727 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_HANDLE_H #define _POSIX_HANDLE_H @@ -148,4 +138,6 @@ int posix_handle_init (xlator_t *this); int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, char *real_path); +int +posix_handle_trash_init (xlator_t *this); #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 06b5cedcb..e295f8850 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -32,6 +22,7 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -54,16 +45,9 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" #include <fnmatch.h> -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct iatt *stbuf; - loc_t *loc; -} posix_xattr_filler_t; - char *marker_xattrs[] = {"trusted.glusterfs.quota.*", "trusted.glusterfs.*.xtime", NULL}; @@ -117,7 +101,7 @@ out: return ignore; } -static void +static int _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, @@ -187,19 +171,13 @@ _posix_xattr_get_set (dict_t *xattr_req, err: if (_fd != -1) close (_fd); - if (databuf) - GF_FREE (databuf); + GF_FREE (databuf); } } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { loc = filler->loc; - if (loc && !list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_WARNING, - "Failed to set dictionary value for %s", - key); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); + if (loc) { + ret = dict_set_uint32 (filler->xattr, key, + loc->inode->fd_count); if (ret < 0) gf_log (filler->this->name, GF_LOG_WARNING, "Failed to set dictionary value for %s", @@ -212,22 +190,31 @@ _posix_xattr_get_set (dict_t *xattr_req, value = GF_CALLOC (1, xattr_size + 1, gf_posix_mt_char); if (!value) - return; + return -1; - sys_lgetxattr (filler->real_path, key, value, - xattr_size); + xattr_size = sys_lgetxattr (filler->real_path, key, value, + xattr_size); + if (xattr_size <= 0) { + gf_log (filler->this->name, 
GF_LOG_WARNING, + "getxattr failed. path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + return -1; + } value[xattr_size] = '\0'; ret = dict_set_bin (filler->xattr, key, value, xattr_size); - if (ret < 0) + if (ret < 0) { gf_log (filler->this->name, GF_LOG_DEBUG, "dict set failed. path: %s, key: %s", filler->real_path, key); + GF_FREE (value); + } } } out: - return; + return 0; } @@ -235,14 +222,17 @@ int posix_fill_gfid_path (xlator_t *this, const char *path, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -252,14 +242,17 @@ int posix_fill_gfid_fd (xlator_t *this, int fd, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -332,11 +325,21 @@ posix_istat (xlator_t *this, uuid_t gfid, const char *basename, ret = lstat (real_path, &lstatbuf); - if (ret == -1) { - if (errno != ENOENT && errno != ELOOP) - gf_log (this->name, GF_LOG_WARNING, - "lstat failed on %s (%s)", - real_path, strerror (errno)); + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT && errno != ELOOP) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + real_path, strerror (errno)); + } else { + // may be some backend filesystem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. 
" + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + real_path, ret); + ret = -1; + } goto out; } @@ -380,11 +383,21 @@ posix_pstat (xlator_t *this, uuid_t gfid, const char *path, ret = lstat (path, &lstatbuf); - if (ret == -1) { - if (errno != ENOENT) - gf_log (this->name, GF_LOG_WARNING, - "lstat failed on %s (%s)", - path, strerror (errno)); + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + path, strerror (errno)); + } else { + // may be some backend filesystem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + path, ret); + ret = -1; + } goto out; } @@ -443,6 +456,7 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) void *uuid_req = NULL; uuid_t uuid_curr; int ret = 0; + ssize_t size = 0; struct stat stat = {0, }; @@ -452,8 +466,8 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); - if (ret == 16) { + size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { ret = 0; goto verify_handle; } @@ -487,8 +501,8 @@ out: int -posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, - int flags) +posix_set_file_contents (xlator_t *this, const char *path, char *keyp, + data_t *value, int flags) { char * key = NULL; char real_path[PATH_MAX]; @@ -500,7 +514,7 @@ posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, /* XXX: does not handle assigning GFID to created files */ return -1; - key = &(trav->key[15]); + key = &(keyp[15]); sprintf (real_path, "%s/%s", path, key); if (flags & XATTR_REPLACE) { @@ -512,9 +526,8 @@ posix_set_file_contents (xlator_t *this, 
const char *path, data_pair_t *trav, goto create; } - if (trav->value->len) { - ret = write (file_fd, trav->value->data, - trav->value->len); + if (value->len) { + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, @@ -546,7 +559,7 @@ posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, goto out; } - ret = write (file_fd, trav->value->data, trav->value->len); + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, @@ -630,8 +643,7 @@ posix_get_file_contents (xlator_t *this, uuid_t pargfid, out: if (op_ret < 0) { - if (*contents) - GF_FREE (*contents); + GF_FREE (*contents); if (file_fd != -1) close (file_fd); } @@ -643,17 +655,17 @@ static int gf_xattr_enotsup_log; int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - ret = posix_set_file_contents (this, real_path, trav, flags); + if (ZR_FILE_CONTENT_REQUEST(key)) { + ret = posix_set_file_contents (this, real_path, key, value, + flags); } else { - sys_ret = sys_lsetxattr (real_path, trav->key, - trav->value->data, - trav->value->len, flags); + sys_ret = sys_lsetxattr (real_path, key, value->data, + value->len, flags); if (sys_ret < 0) { if (errno == ENOTSUP) { @@ -663,12 +675,13 @@ posix_handle_pair (xlator_t *this, const char *real_path, "supported (try remounting " "brick with 'user_xattr' " "flag)"); - } else if (errno == ENOENT && - !posix_special_xattr (marker_xattrs, - trav->key)) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr on %s failed: %s", real_path, - strerror (errno)); + } else if (errno == ENOENT) { + if (!posix_special_xattr (marker_xattrs, + key)) { + gf_log (this->name, GF_LOG_ERROR, + "setxattr on %s failed: %s", + real_path, strerror (errno)); + } } else { #ifdef GF_DARWIN_HOST_OS @@ 
-676,12 +689,12 @@ posix_handle_pair (xlator_t *this, const char *real_path, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), "%s: key:%s error:%s", - real_path, trav->key, + real_path, key, strerror (errno)); #else /* ! DARWIN */ gf_log (this->name, GF_LOG_ERROR, "%s: key:%s error:%s", - real_path, trav->key, + real_path, key, strerror (errno)); #endif /* DARWIN */ } @@ -696,13 +709,13 @@ out: int posix_fhandle_pair (xlator_t *this, int fd, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data, - trav->value->len, flags); + sys_ret = sys_fsetxattr (fd, key, value->data, + value->len, flags); if (sys_ret < 0) { if (errno == ENOTSUP) { @@ -723,13 +736,11 @@ posix_fhandle_pair (xlator_t *this, int fd, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #else /* ! DARWIN */ gf_log (this->name, GF_LOG_ERROR, "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #endif /* DARWIN */ } @@ -833,7 +844,7 @@ posix_janitor_thread_proc (void *data) time (&now); if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { gf_log (this->name, GF_LOG_TRACE, - "janitor cleaning out /" GF_REPLICATE_TRASH_DIR); + "janitor cleaning out %s", priv->trash_path); nftw (priv->trash_path, janitor_walker, @@ -874,8 +885,8 @@ posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -891,6 +902,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && 
(stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replicate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -904,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = 
sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, data->data, data->len, 0); if (ret != 0) goto out; @@ -924,37 +1003,47 @@ out: return ret; } +static int +_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp (GFID_XATTR_KEY, k) || + !strcmp ("gfid-req", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || + ZR_FILE_CONTENT_REQUEST(k)) { + return 0; + } + + ret = posix_handle_pair (filler->this, filler->real_path, k, v, + XATTR_CREATE); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict) { - data_pair_t *trav = NULL; int ret = -1; + posix_xattr_filler_t filler = {0,}; + if (!dict) goto out; - trav = dict->members_list; - while (trav) { - if (!strcmp (GFID_XATTR_KEY, trav->key) || - !strcmp ("gfid-req", trav->key) || - !strcmp ("system.posix_acl_default", trav->key) || - !strcmp ("system.posix_acl_access", trav->key) || - ZR_FILE_CONTENT_REQUEST(trav->key)) { - trav = trav->next; - continue; - } - - ret = posix_handle_pair (this, path, trav, XATTR_CREATE); - if (ret < 0) { - errno = -ret; - ret = -1; - goto out; - } - trav = trav->next; - } + filler.this = this; + filler.real_path = path; - ret = 0; + ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler); out: return ret; @@ -978,7 +1067,7 @@ __posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p) goto out; } - if (fd->pid != -1) + if (!fd_is_anonymous(fd)) /* anonymous fd */ goto out; @@ -1045,10 +1134,258 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) return ret; } +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + 
this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. */ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! 
-> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} int -posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd, - off_t offset) +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + 
return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. + */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) { - return posix_fd_ctx_get (fd, this, pfd); + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + 
do_fsync = _gf_false; + } + } } diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h index 10aa75edc..81752c17e 100644 --- a/xlators/storage/posix/src/posix-mem-types.h +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __POSIX_MEM_TYPES_H__ #define __POSIX_MEM_TYPES_H__ @@ -30,6 +20,7 @@ enum gf_posix_mem_types_ { gf_posix_mt_int32_t, gf_posix_mt_posix_dev_t, gf_posix_mt_trash_path, + gf_posix_mt_paiocb, gf_posix_mt_end }; #endif diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index f98056286..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -33,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -59,8 +51,11 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "posix-aio.h" +#include "glusterfs-acl.h" extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 #undef HAVE_SET_FSID #ifdef HAVE_SET_FSID @@ -114,7 +109,6 @@ posix_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); /* The Hidden directory should be for housekeeping purpose and it should not get any gfid on it */ @@ -137,7 +131,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); if (uuid_is_null (loc->inode->gfid)) { - posix_gfid_set (this, real_path, loc, xdata); + posix_gfid_heal (this, real_path, 
loc, xdata); MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); } @@ -218,7 +212,8 @@ posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; @@ -569,6 +564,289 @@ out: return 0; } +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} + +static int32_t +_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct) +{ + size_t num_vect = 0; + int32_t 
num_loop = 1; + int32_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + size_t remain = 0; + size_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + lseek(fd, offset, SEEK_SET); + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if 
(ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + 
STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + int32_t posix_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) @@ -1120,6 +1398,7 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; char * real_path = NULL; char * par_path = NULL; + char * gfid_str = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; struct iatt stbuf; @@ -1159,12 +1438,13 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, } if (flags) { - uint32_t hashval = 0; - char *tmp_path = alloca (strlen (priv->trash_path) + 16); + gfid_str = uuid_utoa (stbuf.ia_gfid); + char *tmp_path = alloca (strlen (priv->trash_path) + + strlen ("/") + + strlen (gfid_str) + 1); mkdir (priv->trash_path, 0755); - hashval = gf_dm_hashfn (real_path, strlen (real_path)); - sprintf (tmp_path, "%s/%u", priv->trash_path, hashval); + sprintf (tmp_path, "%s/%s", priv->trash_path, gfid_str); op_ret = rename (real_path, tmp_path); } else { op_ret = rmdir (real_path); @@ -1538,7 +1818,7 @@ posix_link (call_frame_t *frame, xlator_t *this, /* * On most systems (Linux being the notable exception), link(2) * first resolves symlinks. If the target is a directory or - * is nonexistent, it will fail. linkat(2) operates on the + * is nonexistent, it will fail. 
linkat(2) operates on the * symlink instead of its target when the AT_SYMLINK_FOLLOW * flag is not supplied. */ @@ -1724,6 +2004,9 @@ posix_create (call_frame_t *frame, xlator_t *this, goto out; } + if (was_present) + goto fill_stat; + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1754,6 +2037,7 @@ posix_create (call_frame_t *frame, xlator_t *this, strerror (errno)); } +fill_stat: op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; @@ -1808,7 +2092,7 @@ out: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); + &postparent, xdata); return 0; } @@ -1971,11 +2255,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; @@ -2020,14 +2300,12 @@ err: return op_ret; } - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) { int32_t op_ret = 0; int idx = 0; - int align = 4096; int max_buf_size = 0; int retval = 0; char *buf = NULL; @@ -2043,7 +2321,7 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, max_buf_size = vector[idx].iov_len; } - alloc_buf = GF_MALLOC (1 * (max_buf_size + align), gf_posix_mt_char); + alloc_buf = _page_aligned_alloc (max_buf_size, &buf); if (!alloc_buf) { op_ret = -errno; goto err; @@ -2051,9 +2329,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, internal_off = startoff; for (idx = 0; idx < count; idx++) { - /* page aligned buffer */ - buf = GF_ALIGN_BUF (alloc_buf, align); - memcpy (buf, vector[idx].iov_base, vector[idx].iov_len); /* not sure whether writev works on O_DIRECT'd fd */ @@ -2068,12 
+2343,53 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, } err: - if (alloc_buf) - GF_FREE (alloc_buf); + GF_FREE (alloc_buf); return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -2088,6 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2109,6 +2428,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. 
+ */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2118,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2135,14 +2476,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } } ret = posix_fdstat (this, _fd, &postop); @@ -2158,9 +2506,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, - NULL); + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -2287,6 +2642,33 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; 
+ pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2298,6 +2680,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, int ret = -1; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2313,6 +2696,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; @@ -2374,6 +2763,17 @@ out: } static int gf_posix_xattr_enotsup_log; +static int +_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_handle_pair (filler->this, filler->real_path, k, v, + filler->flags); +} int32_t posix_setxattr (call_frame_t *frame, xlator_t *this, @@ -2382,8 +2782,8 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - data_pair_t * trav = NULL; - int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2398,18 +2798,13 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; dict_del (dict, GFID_XATTR_KEY); - trav = dict->members_list; - - while (trav) { - ret = posix_handle_pair (this, real_path, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; + filler.real_path = real_path; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); @@ -2419,6 +2814,53 @@ out: return 0; } 
+ +int +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } + + closedir (fd); + + if (!found) + return -ENOENT; + + ret = dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; + + return ret; +} + /** * posix_getxattr - this function returns a dictionary with all the * key:value pair present as xattr. used for @@ -2432,7 +2874,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; int32_t list_offset = 0; - size_t size = 0; + ssize_t size = 0; size_t remaining_size = 0; char key[4096] = {0,}; char host_buf[1024] = {0,}; @@ -2473,9 +2915,29 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, dict = dict_new (); if (!dict) { + op_errno = ENOMEM; goto out; } + if (loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? 
+ GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { if (!list_empty (&loc->inode->fd_list)) { ret = dict_set_uint32 (dict, (char *)name, 1); @@ -2499,8 +2961,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, else rpath = real_path; - (void) snprintf (host_buf, 1024, "<POSIX(%s):%s:%s>", - priv->base_path, priv->hostname, rpath); + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); dyn_rpath = gf_strdup (host_buf); if (!dyn_rpath) { @@ -2514,6 +2981,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", dyn_rpath); + GF_FREE (dyn_rpath); } goto done; @@ -2538,6 +3006,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", dyn_rpath); + GF_FREE (dyn_rpath); } goto done; } @@ -2556,6 +3025,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", host_buf); + GF_FREE (path); } goto done; } @@ -2573,6 +3043,11 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, "supported (try remounting" " brick with 'user_xattr' " "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); } else { gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s: %s (%s)", @@ -2586,14 +3061,22 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, size); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, key, value, size); 
+ if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); goto out; } @@ -2637,26 +3120,40 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) + size = sys_lgetxattr (real_path, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_errno = errno; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); break; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); goto out; } @@ -2693,7 +3190,7 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, struct posix_fd * pfd = NULL; int _fd = -1; int32_t list_offset = 0; - size_t size = 0; + ssize_t size = 0; size_t remaining_size = 0; char key[4096] = {0,}; char * value = 
NULL; @@ -2740,6 +3237,8 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, size = sys_fgetxattr (_fd, key, NULL, 0); if (size <= 0) { op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "key %s (%s)", key, strerror (op_errno)); goto done; } @@ -2748,14 +3247,22 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) { + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on key %s failed", key); + GF_FREE (value); goto out; } goto done; @@ -2797,24 +3304,41 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_fgetxattr (_fd, key, NULL, 0); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { + op_ret = -1; op_errno = errno; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "the fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); break; + } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, 
op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "failed on key %s", key); + GF_FREE (value); goto out; } remaining_size -= strlen (key) + 1; @@ -2841,6 +3365,17 @@ out: return 0; } +static int +_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_fhandle_pair (filler->this, filler->fd, k, v, + filler->flags); +} int32_t posix_fsetxattr (call_frame_t *frame, xlator_t *this, @@ -2850,8 +3385,9 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; struct posix_fd * pfd = NULL; int _fd = -1; - data_pair_t * trav = NULL; - int ret = -1; + int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2872,18 +3408,13 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, dict_del (dict, GFID_XATTR_KEY); - trav = dict->members_list; - - while (trav) { - ret = posix_fhandle_pair (this, _fd, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; + filler.fd = _fd; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); @@ -2893,6 +3424,28 @@ out: return 0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; + + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } + + return op_ret; +} + int32_t 
posix_removexattr (call_frame_t *frame, xlator_t *this, @@ -2901,6 +3454,7 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -2916,6 +3470,22 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. + */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); if (op_ret == -1) { op_errno = errno; @@ -2943,7 +3513,6 @@ posix_fremovexattr (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; struct posix_fd * pfd = NULL; int _fd = -1; - uint64_t tmp_pfd = 0; int ret = -1; DECLARE_OLD_FS_ID_VAR; @@ -2954,15 +3523,13 @@ posix_fremovexattr (call_frame_t *frame, xlator_t *this, goto out; } - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - _fd = pfd->fd; @@ -3040,9 +3607,31 @@ posix_print_xattr (dict_t *this, static void __add_array (int32_t *dest, int32_t *src, int count) { + int i = 0; + int32_t destval = 0; + for (i = 0; i < count; i++) { + destval = ntoh32 (dest[i]); + if (destval == 0xffffffff) + continue; + dest[i] = hton32 (destval + ntoh32 (src[i])); + } +} + +static void +__or_array (int32_t *dest, int32_t *src, int count) +{ int i = 0; for (i = 0; i < count; i++) { - dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); + dest[i] = hton32 (ntoh32 (dest[i]) | ntoh32 (src[i])); + } +} + +static void +__and_array 
(int32_t *dest, int32_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) & ntoh32 (src[i])); } } @@ -3055,6 +3644,159 @@ __add_long_array (int64_t *dest, int64_t *src, int count) } } +static int +_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + + count = v->len; + array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); + + LOCK (&inode->lock); + { + if (filler->real_path) { + size = sys_lgetxattr (filler->real_path, k, + (char *)array, v->len); + } else { + size = sys_fgetxattr (filler->fd, k, (char *)array, + v->len); + } + + op_errno = errno; + if ((size == -1) && (op_errno != ENODATA) && + (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr (marker_xattrs, + k)) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s while doing " + "xattrop: Key:%s (%s)", + filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fgetxattr failed on fd=%d while doing " + "xattrop: Key:%s (%s)", + filler->fd, + k, strerror (op_errno)); + } + + op_ret = -1; + goto unlock; + } + + switch (optype) { + + case GF_XATTROP_ADD_ARRAY: + __add_array ((int32_t *) array, (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_ADD_ARRAY64: + __add_long_array ((int64_t *) array, (int64_t *) v->data, + v->len / 8); + break; + + case GF_XATTROP_OR_ARRAY: + __or_array ((int32_t *) array, + (int32_t *) v->data, 
+ v->len / 4); + break; + + case GF_XATTROP_AND_ARRAY: + __and_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattrop type (%d) on %s. Please send " + "a bug report to gluster-devel@nongnu.org", + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } + + if (filler->real_path) { + size = sys_lsetxattr (filler->real_path, k, array, + v->len, 0); + } else { + size = sys_fsetxattr (filler->fd, k, (char *)array, + v->len, 0); + } + } +unlock: + UNLOCK (&inode->lock); + + if (op_ret == -1) + goto out; + + op_errno = errno; + if (size == -1) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "setxattr failed on %s while doing xattrop: " + "key=%s (%s)", filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr failed on fd=%d while doing xattrop: " + "key=%s (%s)", filler->fd, + k, strerror (op_errno)); + + op_ret = -1; + goto out; + } else { + size = dict_set_bin (d, k, array, v->len); + + if (size != 0) { + if (filler->real_path) + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", filler->real_path, + k, strerror (-size)); + else + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (fd=%d): " + "key=%s (%s)", filler->fd, + k, strerror (-size)); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + array = NULL; + } + + array = NULL; + +out: + return op_ret; +} + /** * xattrop - xattr operations - for internal use by GlusterFS * @optype: ADD_ARRAY: @@ -3066,36 +3808,24 @@ int do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) { - char *real_path = NULL; - char *array = NULL; - int size = 0; - int count = 0; - - int op_ret = 0; - int op_errno = 0; - - int ret = 0; - int _fd = -1; - struct posix_fd *pfd = NULL; - - data_pair_t *trav = NULL; - - char * path = NULL; - inode_t * inode = NULL; + int 
op_ret = 0; + int op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (xattr, out); VALIDATE_OR_GOTO (this, out); - trav = xattr->members_list; - if (fd) { - ret = posix_fd_ctx_get (fd, this, &pfd); - if (ret < 0) { + op_ret = posix_fd_ctx_get (fd, this, &pfd); + if (op_ret < 0) { gf_log (this->name, GF_LOG_WARNING, "failed to get pfd from fd=%p", fd); - op_ret = -1; op_errno = EBADFD; goto out; } @@ -3106,138 +3836,21 @@ do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, MAKE_INODE_HANDLE (real_path, this, loc, NULL); if (real_path) { - path = gf_strdup (real_path); inode = loc->inode; } else if (fd) { inode = fd->inode; } - while (trav && inode) { - count = trav->value->len; - array = GF_CALLOC (count, sizeof (char), - gf_posix_mt_char); - - LOCK (&inode->lock); - { - if (loc) { - size = sys_lgetxattr (real_path, trav->key, (char *)array, - trav->value->len); - } else { - size = sys_fgetxattr (_fd, trav->key, (char *)array, - trav->value->len); - } - - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else if (op_errno != ENOENT || - !posix_special_xattr (marker_xattrs, - trav->key)) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: Key:%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while doing " - "xattrop: Key:%s (%s)", _fd, - trav->key, strerror (op_errno)); - } - - op_ret = -1; - goto unlock; - } - - switch (optype) { - - case GF_XATTROP_ADD_ARRAY: - __add_array ((int32_t *) array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; - - case 
GF_XATTROP_ADD_ARRAY64: - __add_long_array ((int64_t *) array, (int64_t *) trav->value->data, - trav->value->len / 8); - break; - - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. Please send " - "a bug report to gluster-devel@nongnu.org", - optype, path); - op_ret = -1; - op_errno = EINVAL; - goto unlock; - } - - if (loc) { - size = sys_lsetxattr (real_path, trav->key, array, - trav->value->len, 0); - } else { - size = sys_fsetxattr (_fd, trav->key, (char *)array, - trav->value->len, 0); - } - } - unlock: - UNLOCK (&inode->lock); + filler.this = this; + filler.fd = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; - if (op_ret == -1) - goto out; - - op_errno = errno; - if (size == -1) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing xattrop: " - "key=%s (%s)", _fd, - trav->key, strerror (op_errno)); - - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - if (loc) - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", path, - trav->key, strerror (-size)); - else - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - trav->key, strerror (-size)); - - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } + op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair, + &filler); out: - if (array) - GF_FREE (array); - - if (path) - GF_FREE (path); STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL); return 0; @@ -3495,7 +4108,7 @@ posix_fentrylk (call_frame_t *frame, xlator_t *this, int posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, - gf_dirent_t *entries) + gf_dirent_t 
*entries, xlator_t *this, int32_t skip_dirs) { off_t in_case = -1; size_t filled = 0; @@ -3505,6 +4118,18 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, int32_t this_size = -1; gf_dirent_t *this_entry = NULL; uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + struct stat stbuf = {0,}; + char *hpath = NULL; + int len = 0; + int ret = 0; + + if (skip_dirs) { + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + } if (!off) { rewinddir (dir); @@ -3536,10 +4161,6 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, break; } - if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) - && (!strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))) - continue; - #ifdef __NetBSD__ /* * NetBSD with UFS1 backend uses backing files for @@ -3559,6 +4180,17 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, continue; } + if (skip_dirs) { + if (DT_ISDIR (entry->d_type)) { + continue; + } else if (hpath) { + strcpy (&hpath[len+1],entry->d_name); + ret = lstat (hpath, &stbuf); + if (!ret && S_ISDIR (stbuf.st_mode)) + continue; + } + } + this_size = max (sizeof (gf_dirent_t), sizeof (gfs3_dirplist)) + strlen (entry->d_name) + 1; @@ -3578,6 +4210,7 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, } this_entry->d_off = telldir (dir); this_entry->d_ino = entry->d_ino; + this_entry->d_type = entry->d_type; list_add_tail (&this_entry->list, &entries->list); @@ -3610,6 +4243,66 @@ posix_entry_xattr_fill (xlator_t *this, inode_t *inode, } + +int +posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dict) +{ + gf_dirent_t *entry = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + int len = 0; + struct iatt stbuf = {0, }; + uuid_t gfid; + + if (list_empty(&entries->list)) + return 0; + + itable = 
fd->inode->table; + + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + + list_for_each_entry (entry, &entries->list, list) { + memset (gfid, 0, 16); + inode = inode_grep (fd->inode->table, fd->inode, + entry->d_name); + if (inode) + uuid_copy (gfid, inode->gfid); + + strcpy (&hpath[len+1], entry->d_name); + + posix_pstat (this, gfid, hpath, &stbuf); + + if (!inode) + inode = inode_find (itable, stbuf.ia_gfid); + + if (!inode) + inode = inode_new (itable); + + entry->inode = inode; + + if (dict) { + entry->dict = + posix_entry_xattr_fill (this, entry->inode, + fd, entry->d_name, + dict, &stbuf); + dict_ref (entry->dict); + } + + entry->d_stat = stbuf; + if (stbuf.ia_ino) + entry->d_ino = stbuf.ia_ino; + inode = NULL; + } + + return 0; +} + + int32_t posix_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict) @@ -3621,13 +4314,8 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; gf_dirent_t entries; - struct iatt stbuf = {0, }; - gf_dirent_t *tmp_entry = NULL; - inode_table_t *itable = NULL; -#ifdef IGNORE_READDIRP_ATTRS - uuid_t gfid; - ia_type_t entry_type = 0; -#endif + int32_t skip_dirs = 0; + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -3650,9 +4338,30 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, "dir is NULL for fd=%p", fd); op_errno = EINVAL; goto out; - } + } + + /* When READDIR_FILTER option is set to on, we can filter out + * directory's entry from the entry->list. + */ + ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); - count = posix_fill_readdir (fd, dir, off, size, &entries); + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. 
+ + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. + */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -3661,43 +4370,7 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, if (whichop != GF_FOP_READDIRP) goto out; - itable = fd->inode->table; - - list_for_each_entry (tmp_entry, &entries.list, list) { -#ifdef IGNORE_READDIRP_ATTRS - ret = inode_grep_for_gfid (fd->inode->table, fd->inode, - tmp_entry->d_name, gfid, - &entry_type); - if (ret == 0) { - memset (&stbuf, 0, sizeof (stbuf)); - uuid_copy (stbuf.ia_gfid, gfid); - posix_fill_ino_from_gfid (this, &stbuf); - stbuf.ia_type = entry_type; - } else { - posix_istat (this, fd->inode->gfid, - tmp_entry->d_name, &stbuf); - } -#else - posix_istat (this, fd->inode->gfid, - tmp_entry->d_name, &stbuf); -#endif - if (stbuf.ia_ino) - tmp_entry->d_ino = stbuf.ia_ino; - - if (dict) { - tmp_entry->inode = inode_find (itable, stbuf.ia_gfid); - if (!tmp_entry->inode) - tmp_entry->inode = inode_new (itable); - - tmp_entry->dict = - posix_entry_xattr_fill (this, tmp_entry->inode, - fd, tmp_entry->d_name, - dict, &stbuf); - dict_ref (tmp_entry->dict); - } - - tmp_entry->d_stat = stbuf; - } + posix_readdirp_fill (this, fd, &entries, dict); out: STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); @@ -3763,23 +4436,26 @@ int32_t posix_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int32_t len, dict_t *xdata) { - char *buf = NULL; - int _fd = -1; - struct posix_fd *pfd = NULL; - int op_ret = -1; - int op_errno = 0; - int ret = 0; - int32_t weak_checksum = 0; - unsigned char 
strong_checksum[MD5_DIGEST_LENGTH]; + char *alloc_buf = NULL; + char *buf = NULL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int32_t weak_checksum = 0; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + struct posix_private *priv = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); + priv = this->private; memset (strong_checksum, 0, MD5_DIGEST_LENGTH); - buf = GF_CALLOC (1, len, gf_posix_mt_char); - if (!buf) { + alloc_buf = _page_aligned_alloc (len, &buf); + if (!alloc_buf) { op_errno = ENOMEM; goto out; } @@ -3794,15 +4470,25 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, _fd = pfd->fd; - ret = pread (_fd, buf, len, offset); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pread of %d bytes returned %d (%s)", - len, ret, strerror (errno)); + LOCK (&fd->lock); + { + if (priv->aio_capable && priv->aio_init_done) + __posix_fd_set_odirect (fd, pfd, 0, offset, len); + + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + + op_errno = errno; + } - op_errno = errno; - goto out; } + UNLOCK (&fd->lock); + + if (ret < 0) + goto out; weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) len); gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, (unsigned char *) strong_checksum); @@ -3812,8 +4498,7 @@ out: STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, strong_checksum, NULL); - if (buf) - GF_FREE (buf); + GF_FREE (alloc_buf); return 0; } @@ -3862,6 +4547,99 @@ mem_acct_init (xlator_t *this) return ret; } +static int +posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = this->private; + + ret = sys_chown (priv->base_path, uid, gid); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "uid/gid for 
brick path %s, %s", + priv->base_path, strerror (errno)); + + return ret; +} + + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + struct posix_private *priv = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str = NULL; + + priv = this->private; + + GF_OPTION_RECONF ("brick-uid", uid, options, uint32, out); + GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); + posix_set_owner (this, uid, gid); + + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, + options, bool, out); + + if (priv->aio_configured) + posix_aio_on (this); + else + posix_aio_off (this); + + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); + + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); 
+ + ret = 0; +out: + return ret; +} + + /** * init - */ @@ -3876,12 +4654,16 @@ init (xlator_t *this) int dict_ret = 0; int ret = 0; int op_ret = -1; + ssize_t size = -1; int32_t janitor_sleep = 0; uuid_t old_uuid = {0,}; uuid_t dict_uuid = {0,}; uuid_t gfid = {0,}; uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; char *guuid = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -3962,9 +4744,9 @@ init (xlator_t *this) ret = -1; goto out; } - op_ret = sys_lgetxattr (dir_data->data, - "trusted.glusterfs.volume-id", old_uuid, 16); - if (op_ret == 16) { + size = sys_lgetxattr (dir_data->data, + "trusted.glusterfs.volume-id", old_uuid, 16); + if (size == 16) { if (uuid_compare (old_uuid, dict_uuid)) { gf_log (this->name, GF_LOG_ERROR, "mismatching volume-id (%s) received. " @@ -3973,22 +4755,20 @@ init (xlator_t *this) ret = -1; goto out; } - } else if ((op_ret == -1) && (errno == ENODATA)) { - /* Using the export for first time */ - op_ret = sys_lsetxattr (dir_data->data, - "trusted.glusterfs.volume-id", - dict_uuid, 16, 0); - if (op_ret == -1) { + } else if ((size == -1) && (errno == ENODATA)) { + gf_log (this->name, GF_LOG_ERROR, - "failed to set volume id on export"); + "Extended attribute trusted.glusterfs." 
+ "volume-id is absent"); ret = -1; goto out; - } - } else if ((op_ret == -1) && (errno != ENODATA)) { + + } else if ((size == -1) && (errno != ENODATA)) { /* Wrong 'volume-id' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch volume-id (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } else { ret = -1; @@ -4000,8 +4780,8 @@ init (xlator_t *this) /* Now check if the export directory has some other 'gfid', other than that of root '/' */ - ret = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); - if (ret == 16) { + size = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); + if (size == 16) { if (!__is_root_gfid (gfid)) { gf_log (this->name, GF_LOG_WARNING, "%s: gfid (%s) is not that of glusterfs '/' ", @@ -4009,34 +4789,36 @@ init (xlator_t *this) ret = -1; goto out; } - } else if (ret != -1) { + } else if (size != -1) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: wrong value set as gfid", dir_data->data); ret = -1; goto out; - } else if ((ret == -1) && (errno != ENODATA)) { + } else if ((size == -1) && (errno != ENODATA)) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch gfid (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } else { /* First time volume, set the GFID */ - ret = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, + size = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, 16, XATTR_CREATE); - if (ret) { + if (size) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set gfid (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } } - op_ret = sys_lgetxattr (dir_data->data, "system.posix_acl_access", - NULL, 0); - if ((op_ret < 0) && (errno == ENOTSUP)) + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, + NULL, 0); + if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, "Posix access control list is not supported."); @@ 
-4051,20 +4833,6 @@ init (xlator_t *this) _private->base_path = gf_strdup (dir_data->data); _private->base_path_length = strlen (_private->base_path); - _private->trash_path = GF_CALLOC (1, _private->base_path_length - + strlen ("/") - + strlen (GF_REPLICATE_TRASH_DIR) - + 1, - gf_posix_mt_trash_path); - - if (!_private->trash_path) { - ret = -1; - goto out; - } - - strncpy (_private->trash_path, _private->base_path, _private->base_path_length); - strcat (_private->trash_path, "/" GF_REPLICATE_TRASH_DIR); - LOCK_INIT (&_private->lock); ret = dict_get_str (this->options, "hostname", &_private->hostname); @@ -4200,11 +4968,76 @@ init (xlator_t *this) goto out; } + op_ret = posix_handle_trash_init (this); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Posix landfill setup failed"); + ret = -1; + goto out; + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("brick-uid", uid, uint32, out); + GF_OPTION_INIT ("brick-gid", gid, uint32, out); + posix_set_owner (this, uid, gid); + + GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on (this); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix AIO init failed"); + ret = -1; + goto out; + } + } + + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread 
(this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4270,6 +5103,9 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { @@ -4297,5 +5133,60 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_ANY }, { .key = {"glusterd-uuid"}, .type = GF_OPTION_TYPE_STR }, + { + .key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + { + .key = {"brick-uid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting uid of brick's owner" + }, + { + .key = {"brick-gid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting gid of brick's owner" + }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health 
check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." + }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 7c2b47bb0..3121db271 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_H #define _POSIX_H @@ -53,7 +43,15 @@ #include "timer.h" #include "posix-mem-types.h" #include "posix-handle.h" +#include "call-stub.h" +#ifdef HAVE_LIBAIO +#include <libaio.h> +#include "posix-aio.h" +#endif + +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 /** * posix_fd - internal structure common to file and directory fd's */ @@ -62,9 +60,7 @@ struct posix_fd { int fd; /* fd returned by the kernel */ int32_t flags; /* flags for open/creat */ DIR * dir; /* handle returned by the kernel */ - int flushwrites; int odirect; - int op_performed; struct list_head list; /* to add to the janitor list */ }; @@ -124,8 +120,52 @@ struct posix_private { /* uuid of glusterd that swapned the brick process */ uuid_t glusterd_uuid; + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif + + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; }; +typedef struct { + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct iatt *stbuf; + loc_t *loc; + inode_t *inode; /* for all do_xattrop() key handling */ + int fd; + int flags; + int32_t op_errno; +} posix_xattr_filler_t; + + #define 
POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) #define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) @@ -140,23 +180,29 @@ int posix_pstat (xlator_t *this, uuid_t gfid, const char *real_path, struct iatt *iatt); dict_t *posix_lookup_xattr_fill (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr, struct iatt *buf); -int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags); -int posix_fhandle_pair (xlator_t *this, int fd, data_pair_t *trav, int flags); +int posix_handle_pair (xlator_t *this, const char *real_path, char *key, + data_t *value, int flags); +int posix_fhandle_pair (xlator_t *this, int fd, char *key, data_t *value, + int flags); void posix_spawn_janitor_thread (xlator_t *this); int posix_get_file_contents (xlator_t *this, uuid_t pargfid, const char *name, char **contents); -int posix_set_file_contents (xlator_t *this, const char *path, - data_pair_t *trav, int flags); +int posix_set_file_contents (xlator_t *this, const char *path, char *key, + data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd); -int posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd, - off_t off); void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf); gf_boolean_t posix_special_xattr (char **pattern, char *key); + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); #endif /* _POSIX_H */ |
