Diffstat (limited to 'xlators/performance')
42 files changed, 13395 insertions, 6234 deletions
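
The io-cache rework in the diff below keys each inode's cached pages by page-aligned offset: a read is rounded out to page boundaries, and the new ioc_hashfn() derives a hash key for a page by shifting its offset right by log2(page_size) (ioc_log2_page_size is computed in init() from ctx->page_size). Below is a minimal, self-contained C sketch of that addressing scheme; the PAGE_FLOOR/PAGE_ROOF macros and the local log_base2() helper are illustrative stand-ins that assume a power-of-two page size, not the xlator's own helpers.

/* Sketch: round a request to page boundaries and compute the per-page
 * hash key the same way ioc_hashfn() does (offset >> log2(page_size)).
 * PAGE_FLOOR/PAGE_ROOF/log_base2 are illustrative stand-ins. */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

#define PAGE_FLOOR(off, pgsz)  ((off_t)((uint64_t)(off) & ~((uint64_t)(pgsz) - 1)))
#define PAGE_ROOF(off, pgsz)   ((off_t)(((uint64_t)(off) + (pgsz) - 1) & ~((uint64_t)(pgsz) - 1)))

static int
log_base2 (uint64_t value)
{
        int result = 0;

        while (value > 1) {
                value >>= 1;
                result++;
        }

        return result;
}

int
main (void)
{
        uint64_t page_size = 128 * 1024;   /* example iobuf page size */
        int      log2_page_size = log_base2 (page_size);
        off_t    offset = 200000;          /* example read offset */
        size_t   size = 200000;            /* example read size */

        off_t rounded_offset = PAGE_FLOOR (offset, page_size);
        off_t rounded_end = PAGE_ROOF (offset + (off_t) size, page_size);
        off_t trav;

        /* one iteration per page the request touches, as in
         * ioc_dispatch_requests() */
        for (trav = rounded_offset; trav < rounded_end; trav += page_size)
                printf ("page offset %lld -> hash key %lld\n",
                        (long long) trav,
                        (long long) (trav >> log2_page_size));

        return 0;
}

In the patch, this key is what ioc_hashfn() returns for the per-inode rbthash page table (ioc_inode->cache.page_table), which is created on the first cached read with IOC_PAGE_TABLE_BUCKET_COUNT buckets.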
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index f7504bbe8..a494190ba 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
 
 CLEANFILES =
diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am
index b1bf5bfbf..155be9988 100644
--- a/xlators/performance/io-cache/src/Makefile.am
+++ b/xlators/performance/io-cache/src/Makefile.am
@@ -1,14 +1,16 @@
 xlator_LTLIBRARIES = io-cache.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-io_cache_la_LDFLAGS = -module -avoidversion
+io_cache_la_LDFLAGS = -module -avoid-version
 
 io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c
 io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = io-cache.h
+noinst_HEADERS = io-cache.h ioc-mem-types.h
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(CONTRIBDIR)/rbtree
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES =
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index 8f018fc75..201777b38 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -1,20 +1,11 @@
 /*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program. If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
*/ #ifndef _CONFIG_H @@ -27,364 +18,282 @@ #include "dict.h" #include "xlator.h" #include "io-cache.h" +#include "ioc-mem-types.h" +#include "statedump.h" #include <assert.h> #include <sys/time.h> -uint32_t -ioc_get_priority (ioc_table_t *table, const char *path); +int ioc_log2_page_size; uint32_t ioc_get_priority (ioc_table_t *table, const char *path); -inline ioc_inode_t * +struct volume_options options[]; + + +static inline uint32_t +ioc_hashfn (void *data, int len) +{ + off_t offset; + + offset = *(off_t *) data; + + return (offset >> ioc_log2_page_size); +} + +static inline ioc_inode_t * ioc_inode_reupdate (ioc_inode_t *ioc_inode) { - ioc_table_t *table = ioc_inode->table; + ioc_table_t *table = NULL; - list_add_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - - return ioc_inode; + table = ioc_inode->table; + + list_add_tail (&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + + return ioc_inode; } -inline ioc_inode_t * +static inline ioc_inode_t * ioc_get_inode (dict_t *dict, char *name) { - ioc_inode_t *ioc_inode = NULL; - data_t *ioc_inode_data = dict_get (dict, name); - ioc_table_t *table = NULL; - - if (ioc_inode_data) { - ioc_inode = data_to_ptr (ioc_inode_data); - table = ioc_inode->table; - - ioc_table_lock (table); - { - if (list_empty (&ioc_inode->inode_lru)) { - ioc_inode = ioc_inode_reupdate (ioc_inode); - } - } - ioc_table_unlock (table); - } - - return ioc_inode; + ioc_inode_t *ioc_inode = NULL; + data_t *ioc_inode_data = NULL; + ioc_table_t *table = NULL; + + ioc_inode_data = dict_get (dict, name); + if (ioc_inode_data) { + ioc_inode = data_to_ptr (ioc_inode_data); + table = ioc_inode->table; + + ioc_table_lock (table); + { + if (list_empty (&ioc_inode->inode_lru)) { + ioc_inode = ioc_inode_reupdate (ioc_inode); + } + } + ioc_table_unlock (table); + } + + return ioc_inode; } int32_t ioc_inode_need_revalidate (ioc_inode_t *ioc_inode) { - int8_t need_revalidate = 0; - struct timeval tv = {0,}; - int32_t ret = -1; - ioc_table_t *table = ioc_inode->table; + int8_t need_revalidate = 0; + struct timeval tv = {0,}; + ioc_table_t *table = NULL; - ret = gettimeofday (&tv, NULL); + table = ioc_inode->table; + + gettimeofday (&tv, NULL); - if (time_elapsed (&tv, &ioc_inode->tv) >= table->cache_timeout) - need_revalidate = 1; + if (time_elapsed (&tv, &ioc_inode->cache.tv) >= table->cache_timeout) + need_revalidate = 1; - return need_revalidate; + return need_revalidate; } /* * __ioc_inode_flush - flush all the cached pages of the given inode * - * @ioc_inode: + * @ioc_inode: * * assumes lock is held */ -int32_t +int64_t __ioc_inode_flush (ioc_inode_t *ioc_inode) { - ioc_page_t *curr = NULL, *next = NULL; - int32_t destroy_size = 0; - int32_t ret = 0; - - list_for_each_entry_safe (curr, next, &ioc_inode->pages, pages) { - ret = ioc_page_destroy (curr); - - if (ret != -1) - destroy_size += ret; - } - - return destroy_size; + ioc_page_t *curr = NULL, *next = NULL; + int64_t destroy_size = 0; + int64_t ret = 0; + + list_for_each_entry_safe (curr, next, &ioc_inode->cache.page_lru, + page_lru) { + ret = __ioc_page_destroy (curr); + + if (ret != -1) + destroy_size += ret; + } + + return destroy_size; } void ioc_inode_flush (ioc_inode_t *ioc_inode) { - int32_t destroy_size = 0; - - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - } - ioc_inode_unlock (ioc_inode); - - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); - } 
- - return; + int64_t destroy_size = 0; + + ioc_inode_lock (ioc_inode); + { + destroy_size = __ioc_inode_flush (ioc_inode); + } + ioc_inode_unlock (ioc_inode); + + if (destroy_size) { + ioc_table_lock (ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; + } + ioc_table_unlock (ioc_inode->table); + } + + return; } -/* - * ioc_utimens_cbk - - * - * @frame: - * @cookie: - * @this: - * @op_ret: - * @op_errno: - * @stbuf: - * - */ int32_t -ioc_utimens_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) +ioc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, stbuf); - return 0; + STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; } -/* - * ioc_utimens - - * - * @frame: - * @this: - * @loc: - * @tv: - * - */ int32_t -ioc_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct timespec *tv) +ioc_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + uint64_t ioc_inode = 0; - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + inode_ctx_get (loc->inode, this, &ioc_inode); - STACK_WIND (frame, ioc_utimens_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->utimens, loc, tv); + if (ioc_inode + && ((valid & GF_SET_ATTR_ATIME) + || (valid & GF_SET_ATTR_MTIME))) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); - return 0; + STACK_WIND (frame, ioc_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, xdata); + + return 0; } int32_t ioc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *stbuf, dict_t *dict) + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) { - ioc_inode_t *ioc_inode = NULL; - ioc_local_t *local = frame->local; - ioc_table_t *table = this->private; - ioc_page_t *page = NULL; - data_t *content_data = NULL; - char *src = NULL; - char need_unref = 0; - uint8_t cache_still_valid = 0; - uint32_t weight = 0; - uint64_t tmp_ioc_inode = 0; - char *buf = NULL; - char *tmp = NULL; - int i; - struct iobref *iobref = NULL; - struct iobuf *iobuf = NULL; - - if (op_ret != 0) - goto out; - - inode_ctx_get (inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - if (ioc_inode) { - cache_still_valid = ioc_cache_still_valid (ioc_inode, - stbuf); - - if (!cache_still_valid) { - ioc_inode_flush (ioc_inode); - } - /* update the time-stamp of revalidation */ - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->tv, NULL); - } - ioc_inode_unlock (ioc_inode); - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); - } - - if (local && stbuf->st_size && - local->need_xattr >= stbuf->st_size) { - if (!ioc_inode) { - weight = ioc_get_priority (table, - local->file_loc.path); - ioc_inode = ioc_inode_update (table, - inode, weight); - inode_ctx_put (inode, this, - (uint64_t)(long)ioc_inode); - } - - ioc_inode_lock (ioc_inode); - { - content_data = dict_get (dict, "glusterfs.content"); - page = ioc_page_get (ioc_inode, 0); - - if (content_data) { - if (page) { - iobref_unref 
(page->iobref); - free (page->vector); - page->vector = NULL; - - ioc_table_lock (table); - { - table->cache_used -= - iobref_size (page->iobref); - } - ioc_table_unlock (table); - } else { - page = ioc_page_create (ioc_inode, 0); - } - - src = data_to_ptr (content_data); - - iobuf = iobuf_get (this->ctx->iobuf_pool); - page->iobref = iobref_new (); - iobref_add (page->iobref, iobuf); - - memcpy (iobuf->ptr, src, stbuf->st_size); - - page->vector = CALLOC (1, - sizeof (*page->vector)); - page->vector->iov_base = iobuf->ptr; - page->vector->iov_len = stbuf->st_size; - page->count = 1; - - page->waitq = NULL; - page->size = stbuf->st_size; - page->ready = 1; - - ioc_table_lock (table); - { - table->cache_used += - iobref_size (page->iobref); - } - ioc_table_unlock (table); - - } else { - if (!(page && page->ready)) { - gf_log (this->name, GF_LOG_DEBUG, - "page not present"); - - ioc_inode_unlock (ioc_inode); - STACK_WIND (frame, ioc_lookup_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, - &local->file_loc, - local->xattr_req); - return 0; - } - buf = CALLOC (1, stbuf->st_size); - tmp = buf; - - for (i = 0; i < page->count; i++) { - memcpy (tmp, page->vector[i].iov_base, - page->vector[i].iov_len); - tmp += page->vector[i].iov_len; - } - - gf_log (this->name, GF_LOG_TRACE, - "serving file %s from cache", - local->file_loc.path); - - if (!dict) { - need_unref = 1; - dict = dict_ref ( - get_new_dict ()); - } - dict_set (dict, "glusterfs.content", - data_from_dynptr (buf, - stbuf->st_size)); - } - - ioc_inode->mtime = stbuf->st_mtime; - gettimeofday (&ioc_inode->tv, NULL); - } - ioc_inode_unlock (ioc_inode); - - if (content_data && - ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } - } + ioc_inode_t *ioc_inode = NULL; + ioc_table_t *table = NULL; + uint8_t cache_still_valid = 0; + uint64_t tmp_ioc_inode = 0; + uint32_t weight = 0xffffffff; + const char *path = NULL; + ioc_local_t *local = NULL; -out: - STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, dict); + if (op_ret != 0) + goto out; + + local = frame->local; + if (local == NULL) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; - if (need_unref) { - dict_unref (dict); - } + path = local->file_loc.path; - if (iobref) - iobref_unref (iobref); - if (iobuf) - iobuf_unref (iobuf); + LOCK (&inode->lock); + { + __inode_ctx_get (inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - return 0; + if (!ioc_inode) { + weight = ioc_get_priority (table, path); + + ioc_inode = ioc_inode_update (table, inode, + weight); + + __inode_ctx_put (inode, this, + (uint64_t)(long)ioc_inode); + } + } + UNLOCK (&inode->lock); + + ioc_inode_lock (ioc_inode); + { + if (ioc_inode->cache.mtime == 0) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } + + ioc_inode->ia_size = stbuf->ia_size; + } + ioc_inode_unlock (ioc_inode); + + cache_still_valid = ioc_cache_still_valid (ioc_inode, + stbuf); + + if (!cache_still_valid) { + ioc_inode_flush (ioc_inode); + } + + ioc_table_lock (ioc_inode->table); + { + list_move_tail (&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock (ioc_inode->table); + +out: + if (frame->local != NULL) { + local = frame->local; + loc_wipe (&local->file_loc); + } + + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, stbuf, + xdata, postparent); + return 0; } -int32_t 
+int32_t ioc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) + dict_t *xdata) { - uint64_t content_limit = 0; - uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_page_t *page = NULL; - ioc_local_t *local = NULL; + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; - if (GF_FILE_CONTENT_REQUESTED(xattr_req, &content_limit)) { - local = CALLOC (1, sizeof (*local)); - local->need_xattr = content_limit; - local->file_loc.path = loc->path; - local->file_loc.inode = loc->inode; - frame->local = local; - - inode_ctx_get (loc->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - - if (ioc_inode) { - ioc_inode_lock (ioc_inode); - { - page = ioc_page_get (ioc_inode, 0); - if ((content_limit <= - ioc_inode->table->page_size) && - page && page->ready) { - local->need_xattr = -1; - } - } - ioc_inode_unlock (ioc_inode); - } - } - - STACK_WIND (frame, ioc_lookup_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + local = mem_get0 (this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto unwind; + } - return 0; + ret = loc_copy (&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto unwind; + } + + frame->local = local; + + STACK_WIND (frame, ioc_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, xdata); + + return 0; + +unwind: + STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, NULL, NULL, + NULL, NULL); + + return 0; } /* - * ioc_forget - + * ioc_forget - * * @frame: * @this: @@ -394,19 +303,33 @@ ioc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t ioc_forget (xlator_t *this, inode_t *inode) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; + + inode_ctx_get (inode, this, &ioc_inode); - inode_ctx_get (inode, this, &ioc_inode); + if (ioc_inode) + ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode); + + return 0; +} + +static int32_t +ioc_invalidate(xlator_t *this, inode_t *inode) +{ + uint64_t ioc_addr = 0; + ioc_inode_t *ioc_inode = NULL; + + inode_ctx_get(inode, this, (uint64_t *) &ioc_addr); + ioc_inode = (void *) ioc_addr; if (ioc_inode) - ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode); - + ioc_inode_flush(ioc_inode); + return 0; } - -/* - * ioc_cache_validate_cbk - +/* + * ioc_cache_validate_cbk - * * @frame: * @cookie: @@ -418,90 +341,103 @@ ioc_forget (xlator_t *this, inode_t *inode) */ int32_t ioc_cache_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_inode_t *ioc_inode = NULL; - size_t destroy_size = 0; - struct stat *local_stbuf = NULL; + ioc_local_t *local = NULL; + ioc_inode_t *ioc_inode = NULL; + size_t destroy_size = 0; + struct iatt *local_stbuf = NULL; local = frame->local; - ioc_inode = local->inode; + ioc_inode = local->inode; local_stbuf = stbuf; - if ((op_ret == -1) || - ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, - "cache for inode(%p) is invalid. flushing all pages", - ioc_inode); - /* NOTE: only pages with no waiting frames are flushed by - * ioc_inode_flush. 
page_fault will be generated for all - * the pages which have waiting frames by ioc_inode_wakeup() - */ - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - if (op_ret >= 0) - ioc_inode->mtime = stbuf->st_mtime; - } - ioc_inode_unlock (ioc_inode); - local_stbuf = NULL; - } - - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); - } - - if (op_ret < 0) - local_stbuf = NULL; - - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->tv, NULL); - } - ioc_inode_unlock (ioc_inode); - - ioc_inode_wakeup (frame, ioc_inode, local_stbuf); - - /* any page-fault initiated by ioc_inode_wakeup() will have its own - * fd_ref on fd, safe to unref validate frame's private copy - */ - fd_unref (local->fd); - - STACK_DESTROY (frame->root); - - return 0; + if ((op_ret == -1) || + ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, + "cache for inode(%p) is invalid. flushing all pages", + ioc_inode); + /* NOTE: only pages with no waiting frames are flushed by + * ioc_inode_flush. page_fault will be generated for all + * the pages which have waiting frames by ioc_inode_wakeup() + */ + ioc_inode_lock (ioc_inode); + { + destroy_size = __ioc_inode_flush (ioc_inode); + if (op_ret >= 0) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec + = stbuf->ia_mtime_nsec; + } + } + ioc_inode_unlock (ioc_inode); + local_stbuf = NULL; + } + + if (destroy_size) { + ioc_table_lock (ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; + } + ioc_table_unlock (ioc_inode->table); + } + + if (op_ret < 0) + local_stbuf = NULL; + + ioc_inode_lock (ioc_inode); + { + gettimeofday (&ioc_inode->cache.tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + ioc_inode_wakeup (frame, ioc_inode, local_stbuf); + + /* any page-fault initiated by ioc_inode_wakeup() will have its own + * fd_ref on fd, safe to unref validate frame's private copy + */ + fd_unref (local->fd); + + STACK_DESTROY (frame->root); + + return 0; } int32_t ioc_wait_on_inode (ioc_inode_t *ioc_inode, ioc_page_t *page) { - ioc_waitq_t *waiter = NULL, *trav = NULL; - uint32_t page_found = 0; - - trav = ioc_inode->waitq; - - while (trav) { - if (trav->data == page) { - page_found = 1; - break; - } - trav = trav->next; - } - - if (!page_found) { - waiter = CALLOC (1, sizeof (ioc_waitq_t)); - ERR_ABORT (waiter); - waiter->data = page; - waiter->next = ioc_inode->waitq; - ioc_inode->waitq = waiter; - } - - return 0; + ioc_waitq_t *waiter = NULL, *trav = NULL; + uint32_t page_found = 0; + int32_t ret = 0; + + trav = ioc_inode->waitq; + + while (trav) { + if (trav->data == page) { + page_found = 1; + break; + } + trav = trav->next; + } + + if (!page_found) { + waiter = GF_CALLOC (1, sizeof (ioc_waitq_t), + gf_ioc_mt_ioc_waitq_t); + if (waiter == NULL) { + gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, + "out of memory"); + ret = -ENOMEM; + goto out; + } + + waiter->data = page; + waiter->next = ioc_inode->waitq; + ioc_inode->waitq = waiter; + } + +out: + return ret; } /* @@ -514,54 +450,76 @@ ioc_wait_on_inode (ioc_inode_t *ioc_inode, ioc_page_t *page) */ int32_t ioc_cache_validate (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, - ioc_page_t *page) + ioc_page_t *page) { - call_frame_t *validate_frame = NULL; - ioc_local_t *validate_local = NULL; - - validate_local = CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (validate_local); - 
validate_frame = copy_frame (frame); - validate_local->fd = fd_ref (fd); - validate_local->inode = ioc_inode; - validate_frame->local = validate_local; - - STACK_WIND (validate_frame, ioc_cache_validate_cbk, + call_frame_t *validate_frame = NULL; + ioc_local_t *validate_local = NULL; + ioc_local_t *local = NULL; + int32_t ret = 0; + + local = frame->local; + validate_local = mem_get0 (THIS->local_pool); + if (validate_local == NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, + "out of memory"); + goto out; + } + + validate_frame = copy_frame (frame); + if (validate_frame == NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + mem_put (validate_local); + gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, + "out of memory"); + goto out; + } + + validate_local->fd = fd_ref (fd); + validate_local->inode = ioc_inode; + validate_frame->local = validate_local; + + STACK_WIND (validate_frame, ioc_cache_validate_cbk, FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->fstat, fd); + FIRST_CHILD (frame->this)->fops->fstat, fd, NULL); - return 0; +out: + return ret; } -inline uint32_t +static inline uint32_t is_match (const char *path, const char *pattern) { - char *pathname = NULL; - int32_t ret = 0; - - pathname = strdup (path); - ret = fnmatch (pattern, path, FNM_NOESCAPE); - - free (pathname); - - return (ret == 0); + int32_t ret = 0; + + ret = fnmatch (pattern, path, FNM_NOESCAPE); + + return (ret == 0); } uint32_t ioc_get_priority (ioc_table_t *table, const char *path) { - uint32_t priority = 0; - struct ioc_priority *curr = NULL; - - list_for_each_entry (curr, &table->priority_list, list) { - if (is_match (path, curr->pattern)) - priority = curr->priority; - } - - return priority; + uint32_t priority = 1; + struct ioc_priority *curr = NULL; + + if (list_empty(&table->priority_list)) + return priority; + + priority = 0; + list_for_each_entry (curr, &table->priority_list, list) { + if (is_match (path, curr->pattern)) + priority = curr->priority; + } + + return priority; } -/* +/* * ioc_open_cbk - open callback for io cache * * @frame: call frame @@ -574,73 +532,74 @@ ioc_get_priority (ioc_table_t *table, const char *path) */ int32_t ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) + int32_t op_errno, fd_t *fd, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - inode_t *inode = NULL; - uint32_t weight = 0; - const char *path = NULL; + uint64_t tmp_ioc_inode = 0; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + table = this->private; - inode = local->file_loc.inode; - path = local->file_loc.path; - if (op_ret != -1) { - /* look for ioc_inode corresponding to this fd */ - LOCK (&fd->inode->lock); - { - __inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - - if (!ioc_inode) { - /* - this is the first time someone is opening - this file, assign weight - */ - weight = ioc_get_priority (table, path); - - ioc_inode = ioc_inode_update (table, inode, - weight); - __inode_ctx_put (fd->inode, this, - (uint64_t)(long)ioc_inode); - } else { - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - 
&table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); + if (op_ret != -1) { + inode_ctx_get (fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + + //TODO: see why inode context is NULL and handle it. + if (!ioc_inode) { + gf_log (this->name, GF_LOG_ERROR, "inode context is " + "NULL (%s)", uuid_utoa (fd->inode->gfid)); + goto out; + } + + ioc_table_lock (ioc_inode->table); + { + list_move_tail (&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock (ioc_inode->table); + + ioc_inode_lock (ioc_inode); + { + if ((table->min_file_size > ioc_inode->ia_size) + || ((table->max_file_size > 0) + && (table->max_file_size < ioc_inode->ia_size))) { + fd_ctx_set (fd, this, 1); } + } + ioc_inode_unlock (ioc_inode); + + /* If O_DIRECT open, we disable caching on it */ + if ((local->flags & O_DIRECT)){ + /* O_DIRECT is only for one fd, not the inode + * as a whole + */ + fd_ctx_set (fd, this, 1); + } + + /* weight = 0, we disable caching on it */ + if (weight == 0) { + /* we allow a pattern-matched cache disable this way + */ + fd_ctx_set (fd, this, 1); + } + } + +out: + mem_put (local); + frame->local = NULL; - } - UNLOCK (&fd->inode->lock); - - /* If mandatory locking has been enabled on this file, - we disable caching on it */ - if (((inode->st_mode & S_ISGID) - && !(inode->st_mode & S_IXGRP))) { - fd_ctx_set (fd, this, 1); - } - - /* If O_DIRECT open, we disable caching on it */ - if ((local->flags & O_DIRECT)){ - /* O_DIRECT is only for one fd, not the inode - * as a whole - */ - fd_ctx_set (fd, this, 1); - } - } - - FREE (local); - frame->local = NULL; - - STACK_UNWIND (frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } /* @@ -659,56 +618,182 @@ ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t ioc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct stat *buf) + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0; - const char *path = NULL; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; + int ret = -1; local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + table = this->private; path = local->file_loc.path; - if (op_ret != -1) { - { - /* assign weight */ - weight = ioc_get_priority (table, path); + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority (table, path); + + ioc_inode = ioc_inode_update (table, inode, weight); + + ioc_inode_lock (ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; + + if ((table->min_file_size > ioc_inode->ia_size) + || ((table->max_file_size > 0) + && (table->max_file_size < ioc_inode->ia_size))) { + ret = fd_ctx_set (fd, this, 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set fd ctx", + local->file_loc.path); + } + } + ioc_inode_unlock (ioc_inode); + + inode_ctx_put (fd->inode, this, + (uint64_t)(long)ioc_inode); + + /* If O_DIRECT open, we disable caching on it */ + if (local->flags & O_DIRECT) { + /* + * O_DIRECT is only for one fd, not the 
inode + * as a whole */ + ret = fd_ctx_set (fd, this, 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set fd ctx", + local->file_loc.path); + } + + /* if weight == 0, we disable caching on it */ + if (!weight) { + /* we allow a pattern-matched cache disable this way */ + ret = fd_ctx_set (fd, this, 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set fd ctx", + local->file_loc.path); + } + + } - ioc_inode = ioc_inode_update (table, inode, weight); +out: + frame->local = NULL; + mem_put (local); - inode_ctx_put (fd->inode, this, - (uint64_t)(long)ioc_inode); - } - /* - * If mandatory locking has been enabled on this file, - * we disable caching on it - */ - if ((inode->st_mode & S_ISGID) && - !(inode->st_mode & S_IXGRP)) { - fd_ctx_set (fd, this, 1); - } - - /* If O_DIRECT open, we disable caching on it */ - if (local->flags & O_DIRECT){ - /* - * O_DIRECT is only for one fd, not the inode - * as a whole - */ - fd_ctx_set (fd, this, 1); - } - - } - - frame->local = NULL; - FREE (local); - - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; +} + + +int32_t +ioc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; + + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; + path = local->file_loc.path; + + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority (table, path); + + ioc_inode = ioc_inode_update (table, inode, weight); + + ioc_inode_lock (ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; + } + ioc_inode_unlock (ioc_inode); + + inode_ctx_put (inode, this, + (uint64_t)(long)ioc_inode); + } + +out: + frame->local = NULL; + + loc_wipe (&local->file_loc); + mem_put (local); + + STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +ioc_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; + + local = mem_get0 (this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto unwind; + } + + ret = loc_copy (&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto unwind; + } + + frame->local = local; + + STACK_WIND (frame, ioc_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; + +unwind: + if (local != NULL) { + loc_wipe (&local->file_loc); + mem_put (local); + } + + STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL, + NULL, NULL, NULL); + + return 0; } + /* * ioc_open - open fop for io cache * @frame: @@ -719,29 +804,34 @@ ioc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ int32_t ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd) + fd_t *fd, dict_t *xdata) { - - ioc_local_t *local = NULL; - 
local = CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (local); + ioc_local_t *local = NULL; - local->flags = flags; - local->file_loc.path = loc->path; - local->file_loc.inode = loc->inode; - - frame->local = local; - - STACK_WIND (frame, ioc_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd); + local = mem_get0 (this->local_pool); + if (local == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL); + return 0; + } - return 0; + local->flags = flags; + local->file_loc.path = loc->path; + local->file_loc.inode = loc->inode; + + frame->local = local; + + STACK_WIND (frame, ioc_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, + xdata); + + return 0; } /* * ioc_create - create fop for io cache - * + * * @frame: * @this: * @pathname: @@ -751,21 +841,27 @@ ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ int32_t ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd) + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - ioc_local_t *local = NULL; - - local = CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (local); + ioc_local_t *local = NULL; - local->flags = flags; - local->file_loc.path = loc->path; - frame->local = local; + local = mem_get0 (this->local_pool); + if (local == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, + NULL, NULL, NULL, NULL); + return 0; + } - STACK_WIND (frame, ioc_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, mode, fd); + local->flags = flags; + local->file_loc.path = loc->path; + frame->local = local; - return 0; + STACK_WIND (frame, ioc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, + umask, fd, xdata); + + return 0; } @@ -773,7 +869,7 @@ ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, /* * ioc_release - release fop for io cache - * + * * @frame: * @this: * @fd: @@ -782,11 +878,11 @@ ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, int32_t ioc_release (xlator_t *this, fd_t *fd) { - return 0; + return 0; } -/* - * ioc_readv_disabled_cbk +/* + * ioc_readv_disabled_cbk * @frame: * @cookie: * @this: @@ -795,160 +891,200 @@ ioc_release (xlator_t *this, fd_t *fd) * @vector: * @count: * - */ + */ int32_t ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, - struct iobref *iobref) + int32_t count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, + stbuf, iobref, xdata); + return 0; } int32_t ioc_need_prune (ioc_table_t *table) { - int64_t cache_difference = 0; - - ioc_table_lock (table); - { - cache_difference = table->cache_used - table->cache_size; - } - ioc_table_unlock (table); - - if (cache_difference > 0) - return 1; - else - return 0; + int64_t cache_difference = 0; + + ioc_table_lock (table); + { + cache_difference = table->cache_used - table->cache_size; + } + ioc_table_unlock (table); + + if (cache_difference > 0) + return 1; + else + return 0; } /* * ioc_dispatch_requests - - * + * * @frame: * @inode: * - * + * */ void ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t 
*fd, - off_t offset, size_t size) + off_t offset, size_t size) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_page_t *trav = NULL; - ioc_waitq_t *waitq = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - int32_t fault = 0; - size_t trav_size = 0; - off_t local_offset = 0; - int8_t need_validate = 0; - int8_t might_need_validate = 0; /* - * if a page exists, do we need - * to validate it? - */ + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_page_t *trav = NULL; + ioc_waitq_t *waitq = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + int32_t fault = 0; + size_t trav_size = 0; + off_t local_offset = 0; + int32_t ret = -1; + int8_t need_validate = 0; + int8_t might_need_validate = 0; /* + * if a page exists, do we need + * to validate it? + */ local = frame->local; table = ioc_inode->table; - rounded_offset = floor (offset, table->page_size); - rounded_end = roof (offset + size, table->page_size); - trav_offset = rounded_offset; - - /* once a frame does read, it should be waiting on something */ - local->wait_count++; - - /* Requested region can fall in three different pages, - * 1. Ready - region is already in cache, we just have to serve it. - * 2. In-transit - page fault has been generated on this page, we need - * to wait till the page is ready - * 3. Fault - page is not in cache, we have to generate a page fault - */ - - might_need_validate = ioc_inode_need_revalidate (ioc_inode); - - while (trav_offset < rounded_end) { - ioc_inode_lock (ioc_inode); - //{ - - /* look for requested region in the cache */ - trav = ioc_page_get (ioc_inode, trav_offset); - - local_offset = max (trav_offset, offset); - trav_size = min (((offset+size) - local_offset), - table->page_size); - - if (!trav) { - /* page not in cache, we need to generate page fault */ - trav = ioc_page_create (ioc_inode, trav_offset); - fault = 1; - if (!trav) { - gf_log (frame->this->name, GF_LOG_CRITICAL, - "out of memory"); - } - } - - ioc_wait_on_page (trav, frame, local_offset, trav_size); - - if (trav->ready) { - /* page found in cache */ - if (!might_need_validate) { - /* fresh enough */ - gf_log (frame->this->name, GF_LOG_TRACE, - "cache hit for trav_offset=%"PRId64"" - "/local_offset=%"PRId64"", - trav_offset, local_offset); - waitq = ioc_page_wakeup (trav); - } else { - /* if waitq already exists, fstat revalidate is - already on the way */ - if (!ioc_inode->waitq) { - need_validate = 1; - } - ioc_wait_on_inode (ioc_inode, trav); - } - } - - //} - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - waitq = NULL; - - if (fault) { - fault = 0; - /* new page created, increase the table->cache_used */ - ioc_page_fault (ioc_inode, frame, fd, trav_offset); - } - - if (need_validate) { - need_validate = 0; - gf_log (frame->this->name, GF_LOG_TRACE, - "sending validate request for " - "inode(%"PRId64") at offset=%"PRId64"", - fd->inode->ino, trav_offset); - ioc_cache_validate (frame, ioc_inode, fd, trav); - } - - trav_offset += table->page_size; - } - - ioc_frame_return (frame); - - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } - - return; + rounded_offset = floor (offset, table->page_size); + rounded_end = roof (offset + size, table->page_size); + trav_offset = rounded_offset; + + /* once a frame does read, it should be waiting on something */ + local->wait_count++; + + /* Requested region can fall in three different pages, + * 1. 
Ready - region is already in cache, we just have to serve it. + * 2. In-transit - page fault has been generated on this page, we need + * to wait till the page is ready + * 3. Fault - page is not in cache, we have to generate a page fault + */ + + might_need_validate = ioc_inode_need_revalidate (ioc_inode); + + while (trav_offset < rounded_end) { + ioc_inode_lock (ioc_inode); + { + /* look for requested region in the cache */ + trav = __ioc_page_get (ioc_inode, trav_offset); + + local_offset = max (trav_offset, offset); + trav_size = min (((offset+size) - local_offset), + table->page_size); + + if (!trav) { + /* page not in cache, we need to generate page + * fault + */ + trav = __ioc_page_create (ioc_inode, + trav_offset); + fault = 1; + if (!trav) { + gf_log (frame->this->name, + GF_LOG_CRITICAL, + "out of memory"); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + } + + __ioc_wait_on_page (trav, frame, local_offset, + trav_size); + + if (trav->ready) { + /* page found in cache */ + if (!might_need_validate && !ioc_inode->waitq) { + /* fresh enough */ + gf_log (frame->this->name, GF_LOG_TRACE, + "cache hit for trav_offset=%" + PRId64"/local_offset=%"PRId64"", + trav_offset, local_offset); + waitq = __ioc_page_wakeup (trav, + trav->op_errno); + } else { + /* if waitq already exists, fstat + * revalidate is + * already on the way + */ + if (!ioc_inode->waitq) { + need_validate = 1; + } + + ret = ioc_wait_on_inode (ioc_inode, + trav); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + need_validate = 0; + + waitq = __ioc_page_wakeup (trav, + trav->op_errno); + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + waitq = NULL; + goto out; + } + } + } + + } + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + waitq = NULL; + + if (fault) { + fault = 0; + /* new page created, increase the table->cache_used */ + ioc_page_fault (ioc_inode, frame, fd, trav_offset); + } + + if (need_validate) { + need_validate = 0; + gf_log (frame->this->name, GF_LOG_TRACE, + "sending validate request for " + "inode(%s) at offset=%"PRId64"", + uuid_utoa (fd->inode->gfid), trav_offset); + ret = ioc_cache_validate (frame, ioc_inode, fd, trav); + if (ret == -1) { + ioc_inode_lock (ioc_inode); + { + waitq = __ioc_page_wakeup (trav, + trav->op_errno); + } + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + waitq = NULL; + goto out; + } + } + + trav_offset += table->page_size; + } + +out: + ioc_frame_return (frame); + + if (ioc_need_prune (ioc_inode->table)) { + ioc_prune (ioc_inode->table); + } + + return; } /* * ioc_readv - - * + * * @frame: * @this: * @fd: @@ -958,65 +1094,107 @@ ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, */ int32_t ioc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) + size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_local_t *local = NULL; - uint32_t weight = 0; - - inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - if (!ioc_inode) { - /* caching disabled, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, fd, size, - offset); - return 0; - } - - if (!fd_ctx_get (fd, this, NULL)) { - /* disable caching for this fd, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD 
(frame->this)->fops->readv, fd, size, - offset); - return 0; - } - - local = (ioc_local_t *) CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (local); - INIT_LIST_HEAD (&local->fill_list); - - frame->local = local; - local->pending_offset = offset; - local->pending_size = size; - local->offset = offset; - local->size = size; - local->inode = ioc_inode; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", - frame, offset, size); - - weight = ioc_inode->weight; - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &ioc_inode->table->inode_lru[weight]); - } - ioc_table_unlock (ioc_inode->table); - - ioc_dispatch_requests (frame, ioc_inode, fd, offset, size); - - return 0; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_local_t *local = NULL; + uint32_t weight = 0; + ioc_table_t *table = NULL; + int32_t op_errno = -1; + + if (!this) { + goto out; + } + + inode_ctx_get (fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (!ioc_inode) { + /* caching disabled, go ahead with normal readv */ + STACK_WIND (frame, ioc_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, fd, size, + offset, flags, xdata); + return 0; + } + + + table = this->private; + + if (!table) { + gf_log (this->name, GF_LOG_ERROR, "table is null"); + op_errno = EINVAL; + goto out; + } + + ioc_inode_lock (ioc_inode); + { + if (!ioc_inode->cache.page_table) { + ioc_inode->cache.page_table + = rbthash_table_init + (IOC_PAGE_TABLE_BUCKET_COUNT, + ioc_hashfn, NULL, 0, + table->mem_pool); + + if (ioc_inode->cache.page_table == NULL) { + op_errno = ENOMEM; + ioc_inode_unlock (ioc_inode); + goto out; + } + } + } + ioc_inode_unlock (ioc_inode); + + if (!fd_ctx_get (fd, this, NULL)) { + /* disable caching for this fd, go ahead with normal readv */ + STACK_WIND (frame, ioc_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, fd, size, + offset, flags, xdata); + return 0; + } + + local = mem_get0 (this->local_pool); + if (local == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + op_errno = ENOMEM; + goto out; + } + + INIT_LIST_HEAD (&local->fill_list); + + frame->local = local; + local->pending_offset = offset; + local->pending_size = size; + local->offset = offset; + local->size = size; + local->inode = ioc_inode; + + gf_log (this->name, GF_LOG_TRACE, + "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", + frame, offset, size); + + weight = ioc_inode->weight; + + ioc_table_lock (ioc_inode->table); + { + list_move_tail (&ioc_inode->inode_lru, + &ioc_inode->table->inode_lru[weight]); + } + ioc_table_unlock (ioc_inode->table); + + ioc_dispatch_requests (frame, ioc_inode, fd, offset, size); + return 0; + +out: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); + return 0; } /* * ioc_writev_cbk - - * + * * @frame: * @cookie: * @this: @@ -1026,24 +1204,26 @@ ioc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, */ int32_t ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; local = frame->local; - inode_ctx_get (local->fd->inode, this, &ioc_inode); - - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t 
*)(long)ioc_inode); + inode_ctx_get (local->fd->inode, this, &ioc_inode); - STACK_UNWIND (frame, op_ret, op_errno, stbuf); - return 0; + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } /* * ioc_writev - * + * * @frame: * @this: * @fd: @@ -1054,33 +1234,38 @@ ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ int32_t ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; - - local = CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (local); + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; - /* TODO: why is it not fd_ref'ed */ - local->fd = fd; - frame->local = local; + local = mem_get0 (this->local_pool); + if (local == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); - inode_ctx_get (fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } - STACK_WIND (frame, ioc_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, - iobref); + /* TODO: why is it not fd_ref'ed */ + local->fd = fd; + frame->local = local; - return 0; + inode_ctx_get (fd->inode, this, &ioc_inode); + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, ioc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + + return 0; } /* * ioc_truncate_cbk - - * + * * @frame: * @cookie: * @this: @@ -1089,41 +1274,69 @@ ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, * @buf: * */ -int32_t +int32_t ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + + +/* + * ioc_ftruncate_cbk - + * + * @frame: + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @buf: + * + */ +int32_t +ioc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; } + /* * ioc_truncate - - * + * * @frame: * @this: * @loc: * @offset: * */ -int32_t -ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +int32_t +ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + uint64_t ioc_inode = 0; - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + inode_ctx_get (loc->inode, this, &ioc_inode); - STACK_WIND (frame, ioc_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - return 0; + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, ioc_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } /* * ioc_ftruncate 
- - * + * * @frame: * @this: * @fd: @@ -1131,247 +1344,815 @@ ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) * */ int32_t -ioc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +ioc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; - inode_ctx_get (fd->inode, this, &ioc_inode); + uint64_t ioc_inode = 0; - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + inode_ctx_get (fd->inode, this, &ioc_inode); - STACK_WIND (frame, ioc_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - return 0; + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, ioc_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } int32_t ioc_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct flock *lock) + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; + STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata); + return 0; } -int32_t +int32_t ioc_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct flock *lock) + struct gf_flock *lock, dict_t *xdata) +{ + ioc_inode_t *ioc_inode = NULL; + uint64_t tmp_inode = 0; + + inode_ctx_get (fd->inode, this, &tmp_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_inode; + if (!ioc_inode) { + gf_log (this->name, GF_LOG_DEBUG, + "inode context is NULL: returning EBADFD"); + STACK_UNWIND_STRICT (lk, frame, -1, EBADFD, NULL, NULL); + return 0; + } + + ioc_inode_lock (ioc_inode); + { + gettimeofday (&ioc_inode->cache.tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + STACK_WIND (frame, ioc_lk_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lk, fd, cmd, lock, xdata); + + return 0; +} + +int +ioc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + + if (op_ret <= 0) + goto unwind; + + list_for_each_entry (entry, &entries->list, list) { + /* TODO: fill things */ + } + +unwind: + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + + return 0; +} +int +ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - ioc_inode_t *ioc_inode = NULL; - uint64_t tmp_inode = 0; - - inode_ctx_get (fd->inode, this, &tmp_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_inode; - if (!ioc_inode) { - gf_log (this->name, GF_LOG_DEBUG, - "inode context is NULL: returning EBADFD"); - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; - } - - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->tv, NULL); - } - ioc_inode_unlock (ioc_inode); - - STACK_WIND (frame, ioc_lk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lk, fd, cmd, lock); + STACK_WIND (frame, ioc_readdirp_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, + fd, size, offset, dict); + + return 0; +} +static int32_t +ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); return 0; } +static int32_t +ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get (fd->inode, this, &ioc_inode); 
+ + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +} + +static int32_t +ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(zerofill, frame, op_ret, + op_errno, pre, post, xdata); + return 0; +} + +static int32_t +ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get (fd->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} + + int32_t ioc_get_priority_list (const char *opt_str, struct list_head *first) { - int32_t max_pri = 0; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *priority = NULL; - char *string = NULL; - struct ioc_priority *curr = NULL; - - string = strdup (opt_str); - /* Get the pattern for cache priority. - * "option priority *.jpg:1,abc*:2" etc - */ - /* TODO: inode_lru in table is statically hard-coded to 5, - * should be changed to run-time configuration - */ - stripe_str = strtok_r (string, ",", &tmp_str); - while (stripe_str) { - curr = CALLOC (1, sizeof (struct ioc_priority)); - ERR_ABORT (curr); - list_add_tail (&curr->list, first); - - dup_str = strdup (stripe_str); - pattern = strtok_r (dup_str, ":", &tmp_str1); - if (!pattern) - return -1; - priority = strtok_r (NULL, ":", &tmp_str1); - if (!priority) - return -1; - gf_log ("io-cache", GF_LOG_TRACE, - "ioc priority : pattern %s : priority %s", - pattern, - priority); - curr->pattern = strdup (pattern); - curr->priority = strtol (priority, &tmp_str2, 0); - if (tmp_str2 && (*tmp_str2)) - return -1; - else - max_pri = max (max_pri, curr->priority); - stripe_str = strtok_r (NULL, ",", &tmp_str); - } - - return max_pri; + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; + + string = gf_strdup (opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. 
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + stripe_str = strtok_r (string, ",", &tmp_str); + while (stripe_str) { + curr = GF_CALLOC (1, sizeof (struct ioc_priority), + gf_ioc_mt_ioc_priority); + if (curr == NULL) { + max_pri = -1; + goto out; + } + + list_add_tail (&curr->list, first); + + dup_str = gf_strdup (stripe_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } + + priority = strtok_r (NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; + } + + gf_log ("io-cache", GF_LOG_TRACE, + "ioc priority : pattern %s : priority %s", + pattern, + priority); + + curr->pattern = gf_strdup (pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; + } + + curr->priority = strtol (priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max (max_pri, curr->priority); + } + + GF_FREE (dup_str); + dup_str = NULL; + + stripe_str = strtok_r (NULL, ",", &tmp_str); + } +out: + GF_FREE (string); + + GF_FREE (dup_str); + + if (max_pri == -1) { + list_for_each_entry_safe (curr, tmp, first, list) { + list_del_init (&curr->list); + GF_FREE (curr->pattern); + GF_FREE (curr); + } + } + + return max_pri; } +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_ioc_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + + +static gf_boolean_t +check_cache_size_ok (xlator_t *this, uint64_t cache_size) +{ + gf_boolean_t ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT (this); + opt = xlator_volume_option_get (this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_log (this->name, GF_LOG_ERROR, + "could not get cache-size option"); + goto out; + } + + total_mem = get_mem_size (); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_log (this->name, GF_LOG_DEBUG, "Max cache size is %"PRIu64, + max_cache_size); + + if (cache_size > max_cache_size) { + ret = _gf_false; + gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 + " is greater than the max size of %"PRIu64, + cache_size, max_cache_size); + goto out; + } +out: + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + data_t *data = NULL; + ioc_table_t *table = NULL; + int ret = -1; + uint64_t cache_size_new = 0; + if (!this || !this->private) + goto out; + + table = this->private; + + ioc_table_lock (table); + { + GF_OPTION_RECONF ("cache-timeout", table->cache_timeout, + options, int32, unlock); + + data = dict_get (options, "priority"); + if (data) { + char *option_list = data_to_str (data); + + gf_log (this->name, GF_LOG_TRACE, + "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list (option_list, + &table->priority_list); + + if (table->max_pri == -1) { + goto unlock; + } + table->max_pri ++; + } + + GF_OPTION_RECONF ("max-file-size", table->max_file_size, + options, size, unlock); + + GF_OPTION_RECONF ("min-file-size", table->min_file_size, + options, size, unlock); + + if ((table->max_file_size >= 0) && + (table->min_file_size > table->max_file_size)) { + gf_log (this->name, GF_LOG_ERROR, 
"minimum size (%" + PRIu64") of a file that can be cached is " + "greater than maximum size (%"PRIu64"). " + "Hence Defaulting to old value", + table->min_file_size, table->max_file_size); + goto unlock; + } + + GF_OPTION_RECONF ("cache-size", cache_size_new, + options, size, unlock); + if (!check_cache_size_ok (this, cache_size_new)) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Not reconfiguring cache-size"); + goto unlock; + } + table->cache_size = cache_size_new; + + ret = 0; + } +unlock: + ioc_table_unlock (table); +out: + return ret; +} + + /* - * init - + * init - * @this: * */ -int32_t +int32_t init (xlator_t *this) { - ioc_table_t *table; - dict_t *options = this->options; - uint32_t index = 0; - char *cache_size_string = NULL; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: io-cache not configured with exactly " - "one child"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - table = (void *) CALLOC (1, sizeof (*table)); - ERR_ABORT (table); - - table->xl = this; - table->page_size = this->ctx->page_size; - table->cache_size = IOC_CACHE_SIZE; - - if (dict_get (options, "cache-size")) - cache_size_string = data_to_str (dict_get (options, - "cache-size")); - if (cache_size_string) { - if (gf_string2bytesize (cache_size_string, - &table->cache_size) != 0) { - gf_log ("io-cache", GF_LOG_ERROR, - "invalid number format \"%s\" of " - "\"option cache-size\"", - cache_size_string); - return -1; - } - - gf_log (this->name, GF_LOG_TRACE, - "using cache-size %"PRIu64"", table->cache_size); - } - - table->cache_timeout = 1; - - if (dict_get (options, "cache-timeout")) { - table->cache_timeout = - data_to_uint32 (dict_get (options, - "cache-timeout")); - gf_log (this->name, GF_LOG_TRACE, - "Using %d seconds to revalidate cache", - table->cache_timeout); - } - - INIT_LIST_HEAD (&table->priority_list); - if (dict_get (options, "priority")) { - char *option_list = data_to_str (dict_get (options, - "priority")); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - table->max_pri = ioc_get_priority_list (option_list, - &table->priority_list); - - if (table->max_pri == -1) - return -1; - } - table->max_pri ++; - INIT_LIST_HEAD (&table->inodes); - - table->inode_lru = CALLOC (table->max_pri, sizeof (struct list_head)); - ERR_ABORT (table->inode_lru); - for (index = 0; index < (table->max_pri); index++) - INIT_LIST_HEAD (&table->inode_lru[index]); - - pthread_mutex_init (&table->table_lock, NULL); - this->private = table; - return 0; + ioc_table_t *table = NULL; + dict_t *xl_options = NULL; + uint32_t index = 0; + int32_t ret = -1; + glusterfs_ctx_t *ctx = NULL; + data_t *data = 0; + uint32_t num_pages = 0; + + xl_options = this->options; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: io-cache not configured with exactly " + "one child"); + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + table = (void *) GF_CALLOC (1, sizeof (*table), gf_ioc_mt_ioc_table_t); + if (table == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + goto out; + } + + table->xl = this; + table->page_size = this->ctx->page_size; + + GF_OPTION_INIT ("cache-size", table->cache_size, size, out); + + GF_OPTION_INIT ("cache-timeout", table->cache_timeout, int32, out); + + GF_OPTION_INIT ("min-file-size", table->min_file_size, size, out); + + GF_OPTION_INIT ("max-file-size", table->max_file_size, size, out); + + if (!check_cache_size_ok (this, table->cache_size)) { + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&table->priority_list); + table->max_pri = 1; + data = dict_get (xl_options, "priority"); + if (data) { + char *option_list = data_to_str (data); + gf_log (this->name, GF_LOG_TRACE, + "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list (option_list, + &table->priority_list); + + if (table->max_pri == -1) { + goto out; + } + } + table->max_pri ++; + + INIT_LIST_HEAD (&table->inodes); + + if ((table->max_file_size >= 0) + && (table->min_file_size > table->max_file_size)) { + gf_log ("io-cache", GF_LOG_ERROR, "minimum size (%" + PRIu64") of a file that can be cached is " + "greater than maximum size (%"PRIu64")", + table->min_file_size, table->max_file_size); + goto out; + } + + table->inode_lru = GF_CALLOC (table->max_pri, + sizeof (struct list_head), + gf_ioc_mt_list_head); + if (table->inode_lru == NULL) { + goto out; + } + + for (index = 0; index < (table->max_pri); index++) + INIT_LIST_HEAD (&table->inode_lru[index]); + + this->local_pool = mem_pool_new (ioc_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + pthread_mutex_init (&table->table_lock, NULL); + this->private = table; + + num_pages = (table->cache_size / table->page_size) + + ((table->cache_size % table->page_size) + ? 1 : 0); + + table->mem_pool = mem_pool_new (rbthash_entry_t, num_pages); + if (!table->mem_pool) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to allocate mem_pool"); + goto out; + } + + ret = 0; + + ctx = this->ctx; + ioc_log2_page_size = log_base2 (ctx->page_size); + +out: + if (ret == -1) { + if (table != NULL) { + GF_FREE (table->inode_lru); + GF_FREE (table); + } + } + + return ret; +} + +void +ioc_page_waitq_dump (ioc_page_t *page, char *prefix) +{ + ioc_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + + trav = page->waitq; + + while (trav) { + frame = trav->data; + sprintf (key, "waitq.frame[%d]", i++); + gf_proc_dump_write (key, "%"PRId64, frame->root->unique); + + trav = trav->next; + } +} + +void +__ioc_inode_waitq_dump (ioc_inode_t *ioc_inode, char *prefix) +{ + ioc_waitq_t *trav = NULL; + ioc_page_t *page = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + + trav = ioc_inode->waitq; + + while (trav) { + page = trav->data; + + sprintf (key, "cache-validation-waitq.page[%d].offset", i++); + gf_proc_dump_write (key, "%"PRId64, page->offset); + + trav = trav->next; + } +} + +void +__ioc_page_dump (ioc_page_t *page, char *prefix) +{ + + int ret = -1; + + if (!page) + return; + /* ioc_page_lock can be used to hold the mutex. But in statedump + * its better to use trylock to avoid deadlocks. 
+ */ + ret = pthread_mutex_trylock (&page->page_lock); + if (ret) + goto out; + { + gf_proc_dump_write ("offset", "%"PRId64, page->offset); + gf_proc_dump_write ("size", "%"PRId64, page->size); + gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); + ioc_page_waitq_dump (page, prefix); + } + pthread_mutex_unlock (&page->page_lock); + +out: + if (ret && page) + gf_proc_dump_write ("Unable to dump the page information", + "(Lock acquisition failed) %p", page); + + return; +} + +void +__ioc_cache_dump (ioc_inode_t *ioc_inode, char *prefix) +{ + off_t offset = 0; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + char timestr[256] = {0, }; + + if ((ioc_inode == NULL) || (prefix == NULL)) { + goto out; + } + + table = ioc_inode->table; + + if (ioc_inode->cache.tv.tv_sec) { + gf_time_fmt (timestr, sizeof timestr, + ioc_inode->cache.tv.tv_sec, gf_timefmt_FT); + snprintf (timestr + strlen (timestr), sizeof timestr - strlen (timestr), + ".%"GF_PRI_SUSECONDS, ioc_inode->cache.tv.tv_usec); + + gf_proc_dump_write ("last-cache-validation-time", "%s", + timestr); + } + + for (offset = 0; offset < ioc_inode->ia_size; + offset += table->page_size) { + page = __ioc_page_get (ioc_inode, offset); + if (page == NULL) { + continue; + } + + sprintf (key, "inode.cache.page[%d]", i++); + __ioc_page_dump (page, key); + } +out: + return; +} + + +int +ioc_inode_dump (xlator_t *this, inode_t *inode) +{ + + char *path = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + gf_boolean_t section_added = _gf_false; + char uuid_str[64] = {0,}; + + if (this == NULL || inode == NULL) + goto out; + + gf_proc_dump_build_key (key_prefix, "io-cache", "inode"); + + inode_ctx_get (inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (ioc_inode == NULL) + goto out; + + /* Similar to ioc_page_dump function its better to use + * pthread_mutex_trylock and not to use gf_log in statedump + * to avoid deadlocks. 
+ */ + ret = pthread_mutex_trylock (&ioc_inode->inode_lock); + if (ret) + goto out; + + { + if (uuid_is_null (ioc_inode->inode->gfid)) + goto unlock; + + gf_proc_dump_add_section (key_prefix); + section_added = _gf_true; + + __inode_path (ioc_inode->inode, NULL, &path); + + gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight); + + if (path) { + gf_proc_dump_write ("path", "%s", path); + GF_FREE (path); + } + + gf_proc_dump_write ("uuid", "%s", uuid_utoa_r + (ioc_inode->inode->gfid, uuid_str)); + __ioc_cache_dump (ioc_inode, key_prefix); + __ioc_inode_waitq_dump (ioc_inode, key_prefix); + } +unlock: + pthread_mutex_unlock (&ioc_inode->inode_lock); + +out: + if (ret && ioc_inode) { + if (section_added == _gf_false) + gf_proc_dump_add_section (key_prefix); + gf_proc_dump_write ("Unable to print the status of ioc_inode", + "(Lock acquisition failed) %s", + uuid_utoa (inode->gfid)); + } + return ret; +} + +int +ioc_priv_dump (xlator_t *this) +{ + ioc_table_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + int ret = -1; + gf_boolean_t add_section = _gf_false; + + if (!this || !this->private) + goto out; + + priv = this->private; + + gf_proc_dump_build_key (key_prefix, "io-cache", "priv"); + gf_proc_dump_add_section (key_prefix); + add_section = _gf_true; + + ret = pthread_mutex_trylock (&priv->table_lock); + if (ret) + goto out; + { + gf_proc_dump_write ("page_size", "%ld", priv->page_size); + gf_proc_dump_write ("cache_size", "%ld", priv->cache_size); + gf_proc_dump_write ("cache_used", "%ld", priv->cache_used); + gf_proc_dump_write ("inode_count", "%u", priv->inode_count); + gf_proc_dump_write ("cache_timeout", "%u", priv->cache_timeout); + gf_proc_dump_write ("min-file-size", "%u", priv->min_file_size); + gf_proc_dump_write ("max-file-size", "%u", priv->max_file_size); + } + pthread_mutex_unlock (&priv->table_lock); +out: + if (ret && priv) { + if (!add_section) { + gf_proc_dump_build_key (key_prefix, "xlator." 
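The dump routines here intentionally use pthread_mutex_trylock rather than a blocking lock, so that generating a statedump can never deadlock against an I/O path that already holds the page, inode, or table lock; a contended structure is simply reported as un-dumpable. A minimal sketch of that pattern (hypothetical helper, not part of the patch):

#include <pthread.h>

/* Sketch: dump only when the lock is free; a non-zero return from
 * trylock (typically EBUSY) means some fop holds it, so skip rather
 * than block. */
static int
dump_if_uncontended (pthread_mutex_t *lock)
{
        int ret = pthread_mutex_trylock (lock);

        if (ret != 0)
                return ret;

        /* ... emit state here ... */

        pthread_mutex_unlock (lock);
        return 0;
}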
+ "performance.io-cache", "priv"); + gf_proc_dump_add_section (key_prefix); + } + gf_proc_dump_write ("Unable to dump the state of private " + "structure of io-cache xlator", "(Lock " + "acquisition failed) %s", this->name); + } + + return 0; } /* * fini - - * + * * @this: * */ void fini (xlator_t *this) { - ioc_table_t *table = this->private; + ioc_table_t *table = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; + int i = 0; + + table = this->private; + + if (table == NULL) + return; - pthread_mutex_destroy (&table->table_lock); - FREE (table); + this->private = NULL; - this->private = NULL; - return; + if (table->mem_pool != NULL) { + mem_pool_destroy (table->mem_pool); + table->mem_pool = NULL; + } + + list_for_each_entry_safe (curr, tmp, &table->priority_list, list) { + list_del_init (&curr->list); + GF_FREE (curr->pattern); + GF_FREE (curr); + } + + for (i = 0; i < table->max_pri; i++) { + GF_ASSERT (list_empty (&table->inode_lru[i])); + } + + GF_ASSERT (list_empty (&table->inodes)); + pthread_mutex_destroy (&table->table_lock); + GF_FREE (table); + + this->private = NULL; + return; } struct xlator_fops fops = { - .open = ioc_open, - .create = ioc_create, - .readv = ioc_readv, - .writev = ioc_writev, - .truncate = ioc_truncate, - .ftruncate = ioc_ftruncate, - .utimens = ioc_utimens, - .lookup = ioc_lookup, - .lk = ioc_lk + .open = ioc_open, + .create = ioc_create, + .readv = ioc_readv, + .writev = ioc_writev, + .truncate = ioc_truncate, + .ftruncate = ioc_ftruncate, + .lookup = ioc_lookup, + .lk = ioc_lk, + .setattr = ioc_setattr, + .mknod = ioc_mknod, + + .readdirp = ioc_readdirp, + .discard = ioc_discard, + .zerofill = ioc_zerofill, }; -struct xlator_mops mops = { + +struct xlator_dumpops dumpops = { + .priv = ioc_priv_dump, + .inodectx = ioc_inode_dump, }; struct xlator_cbks cbks = { - .forget = ioc_forget, - .release = ioc_release + .forget = ioc_forget, + .release = ioc_release, + .invalidate = ioc_invalidate, }; struct volume_options options[] = { - { .key = {"priority"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"cache-timeout", "force-revalidate-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 60 - }, - { .key = {"page-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 16 * GF_UNIT_KB, - .max = 4 * GF_UNIT_MB - }, - { .key = {"cache-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 4 * GF_UNIT_MB, - .max = 6 * GF_UNIT_GB - }, - { .key = {NULL} }, + { .key = {"priority"}, + .type = GF_OPTION_TYPE_PRIORITY_LIST, + .default_value = "", + .description = "Assigns priority to filenames with specific " + "patterns so that when a page needs to be ejected " + "out of the cache, the page of a file whose " + "priority is the lowest will be ejected earlier" + }, + { .key = {"cache-timeout", "force-revalidate-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60, + .default_value = "1", + .description = "The cached data for a file will be retained till " + "'cache-refresh-timeout' seconds, after which data " + "re-validation is performed." + }, + { .key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4 * GF_UNIT_MB, + .max = 32 * GF_UNIT_GB, + .default_value = "32MB", + .description = "Size of the read cache." + }, + { .key = {"min-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = "Minimum file size which would be cached by the " + "io-cache translator." 
+ }, + { .key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = "Maximum file size which would be cached by the " + "io-cache translator." + }, + { .key = {NULL} }, }; diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h index 5d0590d33..46d758a66 100644 --- a/xlators/performance/io-cache/src/io-cache.h +++ b/xlators/performance/io-cache/src/io-cache.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __IO_CACHE_H @@ -34,11 +25,14 @@ #include "xlator.h" #include "common-utils.h" #include "call-stub.h" +#include "rbthash.h" +#include "hashfn.h" #include <sys/time.h> #include <fnmatch.h> #define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ #define IOC_CACHE_SIZE (32 * 1024 * 1024) +#define IOC_PAGE_TABLE_BUCKET_COUNT 1 struct ioc_table; struct ioc_local; @@ -46,123 +40,135 @@ struct ioc_page; struct ioc_inode; struct ioc_priority { - struct list_head list; - char *pattern; - uint32_t priority; + struct list_head list; + char *pattern; + uint32_t priority; }; /* - * ioc_waitq - this structure is used to represents the waiting + * ioc_waitq - this structure is used to represents the waiting * frames on a page * * @next: pointer to next object in waitq * @data: pointer to the frame which is waiting */ struct ioc_waitq { - struct ioc_waitq *next; - void *data; - off_t pending_offset; - size_t pending_size; + struct ioc_waitq *next; + void *data; + off_t pending_offset; + size_t pending_size; }; /* - * ioc_fill - + * ioc_fill - * */ struct ioc_fill { - struct list_head list; /* list of ioc_fill structures of a frame */ - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct list_head list; /* list of ioc_fill structures of a frame */ + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; struct ioc_local { - mode_t mode; - int32_t flags; - loc_t file_loc; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - struct list_head fill_list; /* list of ioc_fill structures */ - off_t pending_offset; /* + mode_t mode; + int32_t flags; + loc_t file_loc; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + struct list_head fill_list; /* list of ioc_fill structures */ + off_t pending_offset; /* * offset from this frame should * continue */ - size_t pending_size; /* + size_t pending_size; /* * size of data this frame is waiting * on */ - struct ioc_inode *inode; - int32_t 
wait_count; - pthread_mutex_t local_lock; - struct ioc_waitq *waitq; - void *stub; - fd_t *fd; - int32_t need_xattr; - dict_t *xattr_req; + struct ioc_inode *inode; + int32_t wait_count; + pthread_mutex_t local_lock; + struct ioc_waitq *waitq; + void *stub; + fd_t *fd; + int32_t need_xattr; + dict_t *xattr_req; }; /* - * ioc_page - structure to store page of data from file + * ioc_page - structure to store page of data from file * */ struct ioc_page { - struct list_head pages; - struct list_head page_lru; - struct ioc_inode *inode; /* inode this page belongs to */ - struct ioc_priority *priority; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ioc_waitq *waitq; - struct iobref *iobref; - pthread_mutex_t page_lock; + struct list_head page_lru; + struct ioc_inode *inode; /* inode this page belongs to */ + struct ioc_priority *priority; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ioc_waitq *waitq; + struct iobref *iobref; + pthread_mutex_t page_lock; + int32_t op_errno; + char stale; +}; + +struct ioc_cache { + rbthash_table_t *page_table; + struct list_head page_lru; + time_t mtime; /* + * seconds component of file mtime + */ + time_t mtime_nsec; /* + * nanosecond component of file mtime + */ + struct timeval tv; /* + * time-stamp at last re-validate + */ }; struct ioc_inode { - struct ioc_table *table; - struct list_head pages; /* list of pages of this inode */ - struct list_head inode_list; /* - * list of inodes, maintained by io-cache - * translator - */ - struct list_head inode_lru; - struct list_head page_lru; - struct ioc_waitq *waitq; - pthread_mutex_t inode_lock; - uint32_t weight; /* - * weight of the inode, increases on each - * read - */ - time_t mtime; /* - * mtime of the server file when last - * cached - */ - struct timeval tv; /* - * time-stamp at last re-validate - */ + struct ioc_table *table; + off_t ia_size; + struct ioc_cache cache; + struct list_head inode_list; /* + * list of inodes, maintained by + * io-cache translator + */ + struct list_head inode_lru; + struct ioc_waitq *waitq; + pthread_mutex_t inode_lock; + uint32_t weight; /* + * weight of the inode, increases + * on each read + */ + inode_t *inode; }; struct ioc_table { - uint64_t page_size; - uint64_t cache_size; - uint64_t cache_used; - struct list_head inodes; /* list of inodes cached */ - struct list_head active; - struct list_head *inode_lru; - struct list_head priority_list; - int32_t readv_count; - pthread_mutex_t table_lock; - xlator_t *xl; - uint32_t inode_count; - int32_t cache_timeout; - int32_t max_pri; + uint64_t page_size; + uint64_t cache_size; + uint64_t cache_used; + uint64_t min_file_size; + uint64_t max_file_size; + struct list_head inodes; /* list of inodes cached */ + struct list_head active; + struct list_head *inode_lru; + struct list_head priority_list; + int32_t readv_count; + pthread_mutex_t table_lock; + xlator_t *xl; + uint32_t inode_count; + int32_t cache_timeout; + int32_t max_pri; + struct mem_pool *mem_pool; }; typedef struct ioc_table ioc_table_t; @@ -178,36 +184,33 @@ str_to_ptr (char *string); char * ptr_to_str (void *ptr); -int32_t +int32_t ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, - struct iobref *iobref); + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, + struct iobref *iobref, 
dict_t *xdata); ioc_page_t * -ioc_page_get (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset); ioc_page_t * -ioc_page_create (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset); void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset); +ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset); void -ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size); +__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size); ioc_waitq_t * -ioc_page_wakeup (ioc_page_t *page); +__ioc_page_wakeup (ioc_page_t *page, int32_t op_errno); void ioc_page_flush (ioc_page_t *page); ioc_waitq_t * -ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno); - -void -ioc_page_purge (ioc_page_t *page); +__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno); void ioc_frame_return (call_frame_t *frame); @@ -215,99 +218,99 @@ ioc_frame_return (call_frame_t *frame); void ioc_waitq_return (ioc_waitq_t *waitq); -void +int32_t ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size); + size_t size, int32_t op_errno); -#define ioc_inode_lock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "locked inode(%p)", ioc_inode); \ - pthread_mutex_lock (&ioc_inode->inode_lock); \ - } while (0) +#define ioc_inode_lock(ioc_inode) \ + do { \ + gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ + "locked inode(%p)", ioc_inode); \ + pthread_mutex_lock (&ioc_inode->inode_lock); \ + } while (0) -#define ioc_inode_unlock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked inode(%p)", ioc_inode); \ - pthread_mutex_unlock (&ioc_inode->inode_lock); \ - } while (0) +#define ioc_inode_unlock(ioc_inode) \ + do { \ + gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ + "unlocked inode(%p)", ioc_inode); \ + pthread_mutex_unlock (&ioc_inode->inode_lock); \ + } while (0) -#define ioc_table_lock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "locked table(%p)", table); \ - pthread_mutex_lock (&table->table_lock); \ - } while (0) +#define ioc_table_lock(table) \ + do { \ + gf_log (table->xl->name, GF_LOG_TRACE, \ + "locked table(%p)", table); \ + pthread_mutex_lock (&table->table_lock); \ + } while (0) -#define ioc_table_unlock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "unlocked table(%p)", table); \ - pthread_mutex_unlock (&table->table_lock); \ - } while (0) +#define ioc_table_unlock(table) \ + do { \ + gf_log (table->xl->name, GF_LOG_TRACE, \ + "unlocked table(%p)", table); \ + pthread_mutex_unlock (&table->table_lock); \ + } while (0) -#define ioc_local_lock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "locked local(%p)", local); \ - pthread_mutex_lock (&local->local_lock); \ - } while (0) +#define ioc_local_lock(local) \ + do { \ + gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ + "locked local(%p)", local); \ + pthread_mutex_lock (&local->local_lock); \ + } while (0) -#define ioc_local_unlock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked local(%p)", local); \ - pthread_mutex_unlock (&local->local_lock); \ - } while (0) +#define ioc_local_unlock(local) \ + do { \ + gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ + "unlocked local(%p)", local); \ + pthread_mutex_unlock (&local->local_lock); \ + } 
while (0) -#define ioc_page_lock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "locked page(%p)", page); \ - pthread_mutex_lock (&page->page_lock); \ - } while (0) +#define ioc_page_lock(page) \ + do { \ + gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ + "locked page(%p)", page); \ + pthread_mutex_lock (&page->page_lock); \ + } while (0) -#define ioc_page_unlock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked page(%p)", page); \ - pthread_mutex_unlock (&page->page_lock); \ - } while (0) +#define ioc_page_unlock(page) \ + do { \ + gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ + "unlocked page(%p)", page); \ + pthread_mutex_unlock (&page->page_lock); \ + } while (0) static inline uint64_t time_elapsed (struct timeval *now, - struct timeval *then) + struct timeval *then) { - uint64_t sec = now->tv_sec - then->tv_sec; + uint64_t sec = now->tv_sec - then->tv_sec; - if (sec) - return sec; - - return 0; + if (sec) + return sec; + + return 0; } ioc_inode_t * ioc_inode_search (ioc_table_t *table, inode_t *inode); -void +void ioc_inode_destroy (ioc_inode_t *ioc_inode); ioc_inode_t * ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight); -int64_t -ioc_page_destroy (ioc_page_t *page); +int64_t +__ioc_page_destroy (ioc_page_t *page); -int32_t +int64_t __ioc_inode_flush (ioc_inode_t *ioc_inode); void @@ -315,10 +318,10 @@ ioc_inode_flush (ioc_inode_t *ioc_inode); void ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct stat *stbuf); + struct iatt *stbuf); int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct stat *stbuf); +ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf); int32_t ioc_prune (ioc_table_t *table); diff --git a/xlators/performance/io-cache/src/ioc-inode.c b/xlators/performance/io-cache/src/ioc-inode.c index 602077202..86a54bb14 100644 --- a/xlators/performance/io-cache/src/ioc-inode.c +++ b/xlators/performance/io-cache/src/ioc-inode.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -23,7 +14,9 @@ #endif #include "io-cache.h" +#include "ioc-mem-types.h" +extern int ioc_log2_page_size; /* * str_to_ptr - convert a string to pointer @@ -33,10 +26,14 @@ void * str_to_ptr (char *string) { - void *ptr = NULL; + void *ptr = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", string, out); ptr = (void *)strtoul (string, NULL, 16); - return ptr; + +out: + return ptr; } @@ -48,97 +45,127 @@ str_to_ptr (char *string) char * ptr_to_str (void *ptr) { - char *str = NULL; - asprintf (&str, "%p", ptr); - return str; + int ret = 0; + char *str = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", ptr, out); + + ret = gf_asprintf (&str, "%p", ptr); + if (-1 == ret) { + gf_log ("io-cache", GF_LOG_WARNING, + "asprintf failed while converting ptr to str"); + str = NULL; + goto out; + } + +out: + return str; } + void -ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct stat *stbuf) +ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iatt *stbuf) { - ioc_waitq_t *waiter = NULL, *waited = NULL; - ioc_waitq_t *page_waitq = NULL; - int8_t cache_still_valid = 1; - ioc_local_t *local = NULL; - int8_t need_fault = 0; - ioc_page_t *waiter_page = NULL; + ioc_waitq_t *waiter = NULL, *waited = NULL; + ioc_waitq_t *page_waitq = NULL; + int8_t cache_still_valid = 1; + ioc_local_t *local = NULL; + int8_t need_fault = 0; + ioc_page_t *waiter_page = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", frame, out); local = frame->local; - ioc_inode_lock (ioc_inode); - { - waiter = ioc_inode->waitq; - ioc_inode->waitq = NULL; - } - ioc_inode_unlock (ioc_inode); - - if (stbuf) - cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf); - else - cache_still_valid = 0; - - if (!waiter) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "cache validate called without any " - "page waiting to be validated"); - } - - while (waiter) { - waiter_page = waiter->data; - page_waitq = NULL; - - if (waiter_page) { - if (cache_still_valid) { - /* cache valid, wake up page */ - ioc_inode_lock (ioc_inode); - { - page_waitq = - ioc_page_wakeup (waiter_page); - } - ioc_inode_unlock (ioc_inode); - if (page_waitq) - ioc_waitq_return (page_waitq); - } else { - /* cache invalid, generate page fault and set - * page->ready = 0, to avoid double faults - */ - ioc_inode_lock (ioc_inode); - - if (waiter_page->ready) { - waiter_page->ready = 0; - need_fault = 1; - } else { - gf_log (frame->this->name, - GF_LOG_TRACE, - "validate frame(%p) is waiting" - "for in-transit page = %p", - frame, waiter_page); - } - - ioc_inode_unlock (ioc_inode); - - if (need_fault) { - need_fault = 0; - ioc_page_fault (ioc_inode, frame, - local->fd, - waiter_page->offset); - } - } - } - - waited = waiter; - waiter = waiter->next; - - waited->data = NULL; - free (waited); - } + GF_VALIDATE_OR_GOTO (frame->this->name, local, out); + + if (ioc_inode == NULL) { + local->op_ret = -1; + local->op_errno = EINVAL; + gf_log (frame->this->name, GF_LOG_WARNING, "ioc_inode is NULL"); + goto out; + } + + ioc_inode_lock (ioc_inode); + { + waiter = ioc_inode->waitq; + ioc_inode->waitq = NULL; + } + ioc_inode_unlock (ioc_inode); + + if (stbuf) + cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf); + else + cache_still_valid = 0; + + if (!waiter) { + gf_log (frame->this->name, GF_LOG_WARNING, + "cache validate called without any " + "page waiting to be validated"); + } + + while (waiter) { + waiter_page = waiter->data; + page_waitq = NULL; + + if (waiter_page) { + if (cache_still_valid) { + /* cache valid, wake up page */ + 
ioc_inode_lock (ioc_inode); + { + page_waitq = + __ioc_page_wakeup (waiter_page, + waiter_page->op_errno); + } + ioc_inode_unlock (ioc_inode); + if (page_waitq) + ioc_waitq_return (page_waitq); + } else { + /* cache invalid, generate page fault and set + * page->ready = 0, to avoid double faults + */ + ioc_inode_lock (ioc_inode); + { + if (waiter_page->ready) { + waiter_page->ready = 0; + need_fault = 1; + } else { + gf_log (frame->this->name, + GF_LOG_TRACE, + "validate frame(%p) is " + "waiting for in-transit" + " page = %p", frame, + waiter_page); + } + } + ioc_inode_unlock (ioc_inode); + + if (need_fault) { + need_fault = 0; + ioc_page_fault (ioc_inode, frame, + local->fd, + waiter_page->offset); + } + } + } + + waited = waiter; + waiter = waiter->next; + + waited->data = NULL; + GF_FREE (waited); + } + +out: + return; } -/* - * ioc_inode_update - create a new ioc_inode_t structure and add it to - * the table table. fill in the fields which are derived + +/* + * ioc_inode_update - create a new ioc_inode_t structure and add it to + * the table table. fill in the fields which are derived * from inode_t corresponding to the file - * + * * @table: io-table structure * @inode: inode structure * @@ -147,58 +174,67 @@ ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, ioc_inode_t * ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight) { - ioc_inode_t *ioc_inode = NULL; - - ioc_inode = CALLOC (1, sizeof (ioc_inode_t)); - ERR_ABORT (ioc_inode); - - ioc_inode->table = table; - - /* initialize the list for pages */ - INIT_LIST_HEAD (&ioc_inode->pages); - INIT_LIST_HEAD (&ioc_inode->page_lru); - - ioc_table_lock (table); - - table->inode_count++; - list_add (&ioc_inode->inode_list, &table->inodes); - list_add_tail (&ioc_inode->inode_lru, &table->inode_lru[weight]); - - gf_log (table->xl->name, - GF_LOG_TRACE, - "adding to inode_lru[%d]", weight); - - ioc_table_unlock (table); - - pthread_mutex_init (&ioc_inode->inode_lock, NULL); - ioc_inode->weight = weight; - - return ioc_inode; + ioc_inode_t *ioc_inode = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", table, out); + + ioc_inode = GF_CALLOC (1, sizeof (ioc_inode_t), gf_ioc_mt_ioc_inode_t); + if (ioc_inode == NULL) { + goto out; + } + + ioc_inode->inode = inode; + ioc_inode->table = table; + INIT_LIST_HEAD (&ioc_inode->cache.page_lru); + pthread_mutex_init (&ioc_inode->inode_lock, NULL); + ioc_inode->weight = weight; + + ioc_table_lock (table); + { + table->inode_count++; + list_add (&ioc_inode->inode_list, &table->inodes); + list_add_tail (&ioc_inode->inode_lru, + &table->inode_lru[weight]); + } + ioc_table_unlock (table); + + gf_log (table->xl->name, GF_LOG_TRACE, + "adding to inode_lru[%d]", weight); + +out: + return ioc_inode; } -/* +/* * ioc_inode_destroy - destroy an ioc_inode_t object. * * @inode: inode to destroy * - * to be called only from ioc_forget. + * to be called only from ioc_forget. 
*/ void ioc_inode_destroy (ioc_inode_t *ioc_inode) { - ioc_table_t *table = NULL; + ioc_table_t *table = NULL; - table = ioc_inode->table; + GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); - ioc_table_lock (table); - table->inode_count--; - list_del (&ioc_inode->inode_list); - list_del (&ioc_inode->inode_lru); - ioc_table_unlock (table); - - ioc_inode_flush (ioc_inode); + table = ioc_inode->table; - pthread_mutex_destroy (&ioc_inode->inode_lock); - free (ioc_inode); + ioc_table_lock (table); + { + table->inode_count--; + list_del (&ioc_inode->inode_list); + list_del (&ioc_inode->inode_lru); + } + ioc_table_unlock (table); + + ioc_inode_flush (ioc_inode); + rbthash_table_destroy (ioc_inode->cache.page_table); + + pthread_mutex_destroy (&ioc_inode->inode_lock); + GF_FREE (ioc_inode); +out: + return; } diff --git a/xlators/performance/io-cache/src/ioc-mem-types.h b/xlators/performance/io-cache/src/ioc-mem-types.h new file mode 100644 index 000000000..9b68f9fce --- /dev/null +++ b/xlators/performance/io-cache/src/ioc-mem-types.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __IOC_MT_H__ +#define __IOC_MT_H__ + +#include "mem-types.h" + +enum gf_ioc_mem_types_ { + gf_ioc_mt_iovec = gf_common_mt_end + 1, + gf_ioc_mt_ioc_table_t, + gf_ioc_mt_char, + gf_ioc_mt_ioc_waitq_t, + gf_ioc_mt_ioc_priority, + gf_ioc_mt_list_head, + gf_ioc_mt_call_pool_t, + gf_ioc_mt_ioc_inode_t, + gf_ioc_mt_ioc_fill_t, + gf_ioc_mt_ioc_newpage_t, + gf_ioc_mt_end +}; +#endif diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c index be8803444..b2e20ba65 100644 --- a/xlators/performance/io-cache/src/page.c +++ b/xlators/performance/io-cache/src/page.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -27,88 +18,176 @@ #include "dict.h" #include "xlator.h" #include "io-cache.h" +#include "ioc-mem-types.h" #include <assert.h> #include <sys/time.h> +char +ioc_empty (struct ioc_cache *cache) +{ + char is_empty = -1; + + GF_VALIDATE_OR_GOTO ("io-cache", cache, out); + + is_empty = list_empty (&cache->page_lru); + +out: + return is_empty; +} + + ioc_page_t * -ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) { - int8_t found = 0; - ioc_page_t *page = NULL; - ioc_table_t *table = NULL; - off_t rounded_offset = 0; + ioc_page_t *page = NULL; + ioc_table_t *table = NULL; + off_t rounded_offset = 0; + + GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); table = ioc_inode->table; + GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + rounded_offset = floor (offset, table->page_size); - - if (list_empty (&ioc_inode->pages)) { - return NULL; - } - - list_for_each_entry (page, &ioc_inode->pages, pages) { - if (page->offset == rounded_offset) { - found = 1; - break; - } - } - - /* was previously returning ioc_inode itself.., - * 1st of its type and found one more downstairs :O */ - if (!found){ - page = NULL; - } else { - /* push the page to the end of the lru list */ - list_move_tail (&page->page_lru, &ioc_inode->page_lru); - } - - return page; + + page = rbthash_get (ioc_inode->cache.page_table, &rounded_offset, + sizeof (rounded_offset)); + + if (page != NULL) { + /* push the page to the end of the lru list */ + list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); + } + +out: + return page; +} + + +ioc_page_t * +ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +{ + ioc_page_t *page = NULL; + + if (ioc_inode == NULL) { + goto out; + } + + ioc_inode_lock (ioc_inode); + { + page = __ioc_page_get (ioc_inode, offset); + } + ioc_inode_unlock (ioc_inode); + +out: + return page; } /* - * ioc_page_destroy - + * __ioc_page_destroy - * * @page: * */ int64_t +__ioc_page_destroy (ioc_page_t *page) +{ + int64_t page_size = 0; + + GF_VALIDATE_OR_GOTO ("io-cache", page, out); + + if (page->iobref) + page_size = iobref_size (page->iobref); + + if (page->waitq) { + /* frames waiting on this page, do not destroy this page */ + page_size = -1; + page->stale = 1; + } else { + rbthash_remove (page->inode->cache.page_table, &page->offset, + sizeof (page->offset)); + list_del (&page->page_lru); + + gf_log (page->inode->table->xl->name, GF_LOG_TRACE, + "destroying page = %p, offset = %"PRId64" " + "&& inode = %p", + page, page->offset, page->inode); + + if (page->vector){ + iobref_unref (page->iobref); + GF_FREE (page->vector); + page->vector = NULL; + } + + page->inode = NULL; + } + + if (page_size != -1) { + pthread_mutex_destroy (&page->page_lock); + GF_FREE (page); + } + +out: + return page_size; +} + + +int64_t ioc_page_destroy (ioc_page_t *page) { - int64_t page_size = 0; - - page_size = iobref_size (page->iobref); - - if (page->waitq) { - /* frames waiting on this page, do not destroy this page */ - page_size = -1; - } else { - - list_del (&page->pages); - list_del (&page->page_lru); - - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "destroying page = %p, offset = %"PRId64" " - "&& inode = %p", - page, page->offset, page->inode); - - if (page->vector){ - iobref_unref (page->iobref); - free (page->vector); - page->vector = NULL; - } - - page->inode = NULL; - - } - - if (page_size != -1) { - pthread_mutex_destroy (&page->page_lock); - free (page); - } - - return page_size; + int64_t ret = 0; + + if (page == NULL) { + 
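__ioc_page_get() above looks pages up in the rbthash table by the page-aligned offset, floor (offset, table->page_size), so every byte offset inside one page-size window resolves to the same cached page. An illustrative sketch of that rounding, assuming the default 128KB page size:

#include <stdint.h>

/* Sketch: round a byte offset down to its page boundary; with 128KB
 * pages, offsets 0..131071 key to 0, 131072..262143 key to 131072. */
static uint64_t
page_key (uint64_t offset, uint64_t page_size)
{
        return offset - (offset % page_size);
}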
goto out; + } + + ioc_inode_lock (page->inode); + { + ret = __ioc_page_destroy (page); + } + ioc_inode_unlock (page->inode); + +out: + return ret; } +int32_t +__ioc_inode_prune (ioc_inode_t *curr, uint64_t *size_pruned, + uint64_t size_to_prune, uint32_t index) +{ + ioc_page_t *page = NULL, *next = NULL; + int32_t ret = 0; + ioc_table_t *table = NULL; + + if (curr == NULL) { + goto out; + } + + table = curr->table; + + list_for_each_entry_safe (page, next, &curr->cache.page_lru, page_lru) { + *size_pruned += page->size; + ret = __ioc_page_destroy (page); + + if (ret != -1) + table->cache_used -= ret; + + gf_log (table->xl->name, GF_LOG_TRACE, + "index = %d && table->cache_used = %"PRIu64" && table->" + "cache_size = %"PRIu64, index, table->cache_used, + table->cache_size); + + if ((*size_pruned) >= size_to_prune) + break; + } + + if (ioc_empty (&curr->cache)) { + list_del_init (&curr->inode_lru); + } + +out: + return 0; +} /* * ioc_prune - prune the cache. we have a limit to the number of pages we * can have in-memory. @@ -119,153 +198,157 @@ ioc_page_destroy (ioc_page_t *page) int32_t ioc_prune (ioc_table_t *table) { - ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; - ioc_page_t *page = NULL, *next = NULL; - int32_t ret = -1; - int32_t index = 0; - uint64_t size_to_prune = 0; - uint64_t size_pruned = 0; - - ioc_table_lock (table); - { - size_to_prune = table->cache_used - table->cache_size; - /* take out the least recently used inode */ - for (index=0; index < table->max_pri; index++) { - list_for_each_entry_safe (curr, next_ioc_inode, - &table->inode_lru[index], - inode_lru) { - /* prune page-by-page for this inode, till - * we reach the equilibrium */ - ioc_inode_lock (curr); - /* { */ - - list_for_each_entry_safe (page, next, - &curr->page_lru, - page_lru) { - /* done with all pages, and not - * reached equilibrium yet?? - * continue with next inode in - * lru_list */ - size_pruned += page->size; - ret = ioc_page_destroy (page); - - if (ret != -1) - table->cache_used -= ret; - - gf_log (table->xl->name, - GF_LOG_TRACE, - "index = %d && table->cache_" - "used = %"PRIu64" && table->" - "cache_size = %"PRIu64, - index, table->cache_used, - table->cache_size); - - if (size_pruned >= size_to_prune) - break; - } /* list_for_each_entry_safe(page...) */ - if (list_empty (&curr->pages)) { - list_del_init (&curr->inode_lru); - } - - /* } */ - ioc_inode_unlock (curr); - - if (size_pruned >= size_to_prune) - break; - } /* list_for_each_entry_safe (curr...) */ - - if (size_pruned >= size_to_prune) - break; - } /* for(index=0;...) */ - - } /* ioc_inode_table locked region end */ - ioc_table_unlock (table); - - return 0; + ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; + int32_t index = 0; + uint64_t size_to_prune = 0; + uint64_t size_pruned = 0; + + GF_VALIDATE_OR_GOTO ("io-cache", table, out); + + ioc_table_lock (table); + { + size_to_prune = table->cache_used - table->cache_size; + /* take out the least recently used inode */ + for (index=0; index < table->max_pri; index++) { + list_for_each_entry_safe (curr, next_ioc_inode, + &table->inode_lru[index], + inode_lru) { + /* prune page-by-page for this inode, till + * we reach the equilibrium */ + ioc_inode_lock (curr); + { + __ioc_inode_prune (curr, &size_pruned, + size_to_prune, + index); + } + ioc_inode_unlock (curr); + + if (size_pruned >= size_to_prune) + break; + } /* list_for_each_entry_safe (curr...) */ + + if (size_pruned >= size_to_prune) + break; + } /* for(index=0;...) 
*/ + + } /* ioc_inode_table locked region end */ + ioc_table_unlock (table); + +out: + return 0; } /* - * ioc_page_create - create a new page. + * __ioc_page_create - create a new page. * - * @ioc_inode: + * @ioc_inode: * @offset: * */ ioc_page_t * -ioc_page_create (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset) { - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - off_t rounded_offset = 0; - ioc_page_t *newpage = NULL; - + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + off_t rounded_offset = 0; + ioc_page_t *newpage = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + table = ioc_inode->table; + GF_VALIDATE_OR_GOTO ("io-cache", table, out); + rounded_offset = floor (offset, table->page_size); - newpage = CALLOC (1, sizeof (*newpage)); - ERR_ABORT (newpage); + newpage = GF_CALLOC (1, sizeof (*newpage), gf_ioc_mt_ioc_newpage_t); + if (newpage == NULL) { + goto out; + } + + if (!ioc_inode) { + GF_FREE (newpage); + newpage = NULL; + goto out; + } - if (ioc_inode) - table = ioc_inode->table; - else { - return NULL; - } - - newpage->offset = rounded_offset; - newpage->inode = ioc_inode; - pthread_mutex_init (&newpage->page_lock, NULL); + newpage->offset = rounded_offset; + newpage->inode = ioc_inode; + pthread_mutex_init (&newpage->page_lock, NULL); - list_add_tail (&newpage->page_lru, &ioc_inode->page_lru); - list_add_tail (&newpage->pages, &ioc_inode->pages); + rbthash_insert (ioc_inode->cache.page_table, newpage, &rounded_offset, + sizeof (rounded_offset)); - page = newpage; + list_add_tail (&newpage->page_lru, &ioc_inode->cache.page_lru); - gf_log ("io-cache", GF_LOG_TRACE, - "returning new page %p", page); - return page; + page = newpage; + + gf_log ("io-cache", GF_LOG_TRACE, + "returning new page %p", page); + +out: + return page; } -/* - * ioc_wait_on_page - pause a frame to wait till the arrival of a page. - * here we need to handle the case when the frame who calls wait_on_page - * himself has caused page_fault +/* + * ioc_wait_on_page - pause a frame to wait till the arrival of a page. 
+ * here we need to handle the case when the frame who calls wait_on_page + * himself has caused page_fault * * @page: page to wait on * @frame: call frame who is waiting on page * */ void -ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size) +__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size) { - ioc_waitq_t *waitq = NULL; - ioc_local_t *local = frame->local; - - waitq = CALLOC (1, sizeof (*waitq)); - ERR_ABORT (waitq); - - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) waiting on page = %p, offset=%"PRId64", " - "size=%"GF_PRI_SIZET"", - frame, page, offset, size); - - waitq->data = frame; - waitq->next = page->waitq; - waitq->pending_offset = offset; - waitq->pending_size = size; - page->waitq = waitq; - /* one frame can wait only once on a given page, - * local->wait_count is number of pages a frame is waiting on */ - ioc_local_lock (local); - { - local->wait_count++; - } - ioc_local_unlock (local); + ioc_waitq_t *waitq = NULL; + ioc_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", frame, out); + local = frame->local; + + GF_VALIDATE_OR_GOTO (frame->this->name, local, out); + + if (page == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_log (frame->this->name, GF_LOG_WARNING, + "asked to wait on a NULL page"); + } + + waitq = GF_CALLOC (1, sizeof (*waitq), gf_ioc_mt_ioc_waitq_t); + if (waitq == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + gf_log (frame->this->name, GF_LOG_TRACE, + "frame(%p) waiting on page = %p, offset=%"PRId64", " + "size=%"GF_PRI_SIZET"", + frame, page, offset, size); + + waitq->data = frame; + waitq->next = page->waitq; + waitq->pending_offset = offset; + waitq->pending_size = size; + page->waitq = waitq; + /* one frame can wait only once on a given page, + * local->wait_count is number of pages a frame is waiting on */ + ioc_local_lock (local); + { + local->wait_count++; + } + ioc_local_unlock (local); + +out: + return; } /* - * ioc_cache_still_valid - see if cached pages ioc_inode are still valid + * ioc_cache_still_valid - see if cached pages ioc_inode are still valid * against given stbuf * * @ioc_inode: @@ -274,185 +357,212 @@ ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, * assumes ioc_inode is locked */ int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct stat *stbuf) +ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) { - int8_t cache_still_valid = 1; - + int8_t cache_still_valid = 1; + + GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + #if 0 - if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime) || - (stbuf->st_mtim.tv_nsec != ioc_inode->stbuf.st_mtim.tv_nsec)) - cache_still_valid = 0; + if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) || + (stbuf->st_mtim.tv_nsec != ioc_inode->stbuf.st_mtim.tv_nsec)) + cache_still_valid = 0; #else - if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime)) - cache_still_valid = 0; + if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) + || (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec)) + cache_still_valid = 0; #endif #if 0 - /* talk with avati@zresearch.com to enable this section */ - if (!ioc_inode->mtime && stbuf) { - cache_still_valid = 1; - ioc_inode->mtime = stbuf->st_mtime; - } + /* talk with avati@gluster.com to enable this section */ + if (!ioc_inode->mtime && stbuf) { + cache_still_valid = 1; + ioc_inode->mtime = stbuf->ia_mtime; + } #endif - return cache_still_valid; +out: + return cache_still_valid; } void 
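ioc_cache_still_valid() above treats the cache as valid only while both the seconds and the nanoseconds component of the file's mtime match what was recorded when the data was cached. As a sketch (illustrative only), the predicate amounts to:

#include <stdint.h>

/* Sketch: cached data stays usable only while the stored mtime matches
 * the freshly looked-up mtime exactly, to the nanosecond. */
static int
cache_mtime_matches (int64_t cached_sec, int64_t cached_nsec,
                     int64_t stat_sec, int64_t stat_nsec)
{
        return (cached_sec == stat_sec) && (cached_nsec == stat_nsec);
}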
ioc_waitq_return (ioc_waitq_t *waitq) { - ioc_waitq_t *trav = NULL; - ioc_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ioc_waitq_t *trav = NULL; + ioc_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ioc_frame_return (frame); - free (trav); - } + frame = trav->data; + ioc_frame_return (frame); + GF_FREE (trav); + } } int ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - ioc_local_t *local = NULL; - off_t offset = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - off_t trav_offset = 0; - size_t payload_size = 0; - int32_t destroy_size = 0; - size_t page_size = 0; - ioc_waitq_t *waitq = NULL; - size_t iobref_page_size = 0; + ioc_local_t *local = NULL; + off_t offset = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int32_t destroy_size = 0; + size_t page_size = 0; + ioc_waitq_t *waitq = NULL; + size_t iobref_page_size = 0; + char zero_filled = 0; + + GF_ASSERT (frame); local = frame->local; + GF_ASSERT (local); + offset = local->pending_offset; ioc_inode = local->inode; - table = ioc_inode->table; + GF_ASSERT (ioc_inode); - trav_offset = offset; - payload_size = op_ret; - - ioc_inode_lock (ioc_inode); - { - if (op_ret == -1 || - (op_ret >= 0 && - !ioc_cache_still_valid(ioc_inode, stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "cache for inode(%p) is invalid. flushing " - "all pages", ioc_inode); - destroy_size = __ioc_inode_flush (ioc_inode); - } - - if (op_ret >= 0) - ioc_inode->mtime = stbuf->st_mtime; - - gettimeofday (&ioc_inode->tv, NULL); - - if (op_ret < 0) { - /* error, readv returned -1 */ - page = ioc_page_get (ioc_inode, offset); - if (page) - waitq = ioc_page_error (page, op_ret, - op_errno); - } else { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "op_ret = %d", op_ret); - page = ioc_page_get (ioc_inode, offset); - if (!page) { - /* page was flushed */ - /* some serious bug ? */ - gf_log (this->name, GF_LOG_DEBUG, - "wasted copy: %"PRId64"[+%"PRId64"] " - "ioc_inode=%p", offset, - table->page_size, ioc_inode); - } else { - if (page->vector) { - iobref_unref (page->iobref); - free (page->vector); - page->vector = NULL; - } - - /* keep a copy of the page for our cache */ - page->vector = iov_dup (vector, count); - page->count = count; - if (iobref) { - page->iobref = iobref_ref (iobref); - } else { - /* TODO: we have got a response to - * our request and no data */ - gf_log (this->name, GF_LOG_CRITICAL, - "frame>root>rsp_refs is null"); - } /* if(frame->root->rsp_refs) */ - - /* page->size should indicate exactly how - * much the readv call to the child - * translator returned. 
earlier op_ret - * from child translator was used, which - * gave rise to a bug where reads from - * io-cached volume were resulting in 0 - * byte replies */ - page_size = iov_length(vector, count); - - page->size = page_size; + table = ioc_inode->table; + GF_ASSERT (table); + + zero_filled = ((op_ret >=0) && (stbuf->ia_mtime == 0)); + + ioc_inode_lock (ioc_inode); + { + if (op_ret == -1 || !(zero_filled || + ioc_cache_still_valid(ioc_inode, + stbuf))) { + gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, + "cache for inode(%p) is invalid. flushing " + "all pages", ioc_inode); + destroy_size = __ioc_inode_flush (ioc_inode); + } + + if ((op_ret >= 0) && !zero_filled) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } + + gettimeofday (&ioc_inode->cache.tv, NULL); + + if (op_ret < 0) { + /* error, readv returned -1 */ + page = __ioc_page_get (ioc_inode, offset); + if (page) + waitq = __ioc_page_error (page, op_ret, + op_errno); + } else { + gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, + "op_ret = %d", op_ret); + page = __ioc_page_get (ioc_inode, offset); + if (!page) { + /* page was flushed */ + /* some serious bug ? */ + gf_log (frame->this->name, GF_LOG_WARNING, + "wasted copy: %"PRId64"[+%"PRId64"] " + "ioc_inode=%p", offset, + table->page_size, ioc_inode); + } else { + if (page->vector) { + iobref_unref (page->iobref); + GF_FREE (page->vector); + page->vector = NULL; + } + + /* keep a copy of the page for our cache */ + page->vector = iov_dup (vector, count); + if (page->vector == NULL) { + page = __ioc_page_get (ioc_inode, + offset); + if (page != NULL) + waitq = __ioc_page_error (page, + -1, + ENOMEM); + goto unlock; + } + + page->count = count; + if (iobref) { + page->iobref = iobref_ref (iobref); + } else { + /* TODO: we have got a response to + * our request and no data */ + gf_log (frame->this->name, + GF_LOG_CRITICAL, + "frame>root>rsp_refs is null"); + } /* if(frame->root->rsp_refs) */ + + /* page->size should indicate exactly how + * much the readv call to the child + * translator returned. 
earlier op_ret + * from child translator was used, which + * gave rise to a bug where reads from + * io-cached volume were resulting in 0 + * byte replies */ + page_size = iov_length(vector, count); + page->size = page_size; + page->op_errno = op_errno; iobref_page_size = iobref_size (page->iobref); - if (page->waitq) { - /* wake up all the frames waiting on - * this page, including - * the frame which triggered fault */ - waitq = ioc_page_wakeup (page); - } /* if(page->waitq) */ - } /* if(!page)...else */ - } /* if(op_ret < 0)...else */ - } /* ioc_inode locked region end */ - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - - if (iobref_page_size) { - ioc_table_lock (table); - { - table->cache_used += iobref_page_size; - } - ioc_table_unlock (table); - } - - if (destroy_size) { - ioc_table_lock (table); - { - table->cache_used -= destroy_size; - } - ioc_table_unlock (table); - } - - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } - - gf_log (this->name, GF_LOG_TRACE, "fault frame %p returned", frame); - pthread_mutex_destroy (&local->local_lock); - - fd_unref (local->fd); - - STACK_DESTROY (frame->root); - return 0; + if (page->waitq) { + /* wake up all the frames waiting on + * this page, including + * the frame which triggered fault */ + waitq = __ioc_page_wakeup (page, + op_errno); + } /* if(page->waitq) */ + } /* if(!page)...else */ + } /* if(op_ret < 0)...else */ + } /* ioc_inode locked region end */ +unlock: + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + + if (iobref_page_size) { + ioc_table_lock (table); + { + table->cache_used += iobref_page_size; + } + ioc_table_unlock (table); + } + + if (destroy_size) { + ioc_table_lock (table); + { + table->cache_used -= destroy_size; + } + ioc_table_unlock (table); + } + + if (ioc_need_prune (ioc_inode->table)) { + ioc_prune (ioc_inode->table); + } + + gf_log (frame->this->name, GF_LOG_TRACE, "fault frame %p returned", + frame); + pthread_mutex_destroy (&local->local_lock); + + fd_unref (local->fd); + + STACK_DESTROY (frame->root); + return 0; } + /* * ioc_page_fault - - * + * * @ioc_inode: * @frame: * @fd: @@ -460,149 +570,223 @@ ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset) +ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset) { - ioc_table_t *table = NULL; - call_frame_t *fault_frame = NULL; - ioc_local_t *fault_local = NULL; + ioc_table_t *table = NULL; + call_frame_t *fault_frame = NULL; + ioc_local_t *fault_local = NULL; + int32_t op_ret = -1, op_errno = -1; + ioc_waitq_t *waitq = NULL; + ioc_page_t *page = NULL; + + GF_ASSERT (ioc_inode); + if (frame == NULL) { + op_ret = -1; + op_errno = EINVAL; + gf_log ("io-cache", GF_LOG_WARNING, + "page fault on a NULL frame"); + goto err; + } table = ioc_inode->table; fault_frame = copy_frame (frame); - fault_local = CALLOC (1, sizeof (ioc_local_t)); - ERR_ABORT (fault_local); - - /* NOTE: copy_frame() means, the frame the fop whose fd_ref we - * are using till now won't be valid till we get reply from server. 
- * we unref this fd, in fault_cbk */ - fault_local->fd = fd_ref (fd); - - fault_frame->local = fault_local; - pthread_mutex_init (&fault_local->local_lock, NULL); - - INIT_LIST_HEAD (&fault_local->fill_list); - fault_local->pending_offset = offset; - fault_local->pending_size = table->page_size; - fault_local->inode = ioc_inode; - - gf_log (frame->this->name, GF_LOG_TRACE, - "stack winding page fault for offset = %"PRId64" with " - "frame %p", offset, fault_frame); - - STACK_WIND (fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), - FIRST_CHILD(fault_frame->this)->fops->readv, fd, - table->page_size, offset); - return; + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_local = mem_get0 (THIS->local_pool); + if (fault_local == NULL) { + op_ret = -1; + op_errno = ENOMEM; + STACK_DESTROY (fault_frame->root); + goto err; + } + + /* NOTE: copy_frame() means, the frame the fop whose fd_ref we + * are using till now won't be valid till we get reply from server. + * we unref this fd, in fault_cbk */ + fault_local->fd = fd_ref (fd); + + fault_frame->local = fault_local; + pthread_mutex_init (&fault_local->local_lock, NULL); + + INIT_LIST_HEAD (&fault_local->fill_list); + fault_local->pending_offset = offset; + fault_local->pending_size = table->page_size; + fault_local->inode = ioc_inode; + + gf_log (frame->this->name, GF_LOG_TRACE, + "stack winding page fault for offset = %"PRId64" with " + "frame %p", offset, fault_frame); + + STACK_WIND (fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, fd, + table->page_size, offset, 0, NULL); + return; + +err: + ioc_inode_lock (ioc_inode); + { + page = __ioc_page_get (ioc_inode, offset); + if (page != NULL) { + waitq = __ioc_page_error (page, op_ret, op_errno); + } + } + ioc_inode_unlock (ioc_inode); + + if (waitq != NULL) { + ioc_waitq_return (waitq); + } } -void -ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size) + +int32_t +__ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size, int32_t op_errno) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_fill_t *new = NULL; - int8_t found = 0; - + ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_fill_t *new = NULL; + int8_t found = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("io-cache", frame, out); + local = frame->local; + GF_VALIDATE_OR_GOTO (frame->this->name, local, out); + + if (page == NULL) { + gf_log (frame->this->name, GF_LOG_WARNING, + "NULL page has been provided to serve read request"); + local->op_ret = -1; + local->op_errno = EINVAL; + goto out; + } + ioc_inode = page->inode; - gf_log (frame->this->name, GF_LOG_TRACE, - "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" " - "&& page->size = %"GF_PRI_SIZET" && wait_count = %d", - frame, offset, size, page->size, local->wait_count); - - /* immediately move this page to the end of the page_lru list */ - list_move_tail (&page->page_lru, &ioc_inode->page_lru); - /* fill local->pending_size bytes from local->pending_offset */ - if (local->op_ret != -1 && page->size) { - if (offset > page->offset) - /* offset is offset in file, convert it to offset in - * page */ - src_offset = offset - page->offset; - /*FIXME: since offset is the offset within page 
is the - * else case valid? */ - else - /* local->pending_offset is in previous page. do not - * fill until we have filled all previous pages */ - dst_offset = page->offset - offset; - - /* we have to copy from offset to either end of this page - * or till the requested size */ - copy_size = min (page->size - src_offset, - size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } - - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "copy_size = %"GF_PRI_SIZET" && src_offset = " - "%"PRId64" && dst_offset = %"PRId64"", - copy_size, src_offset, dst_offset); - - { - new = CALLOC (1, sizeof (*new)); - ERR_ABORT (new); - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, - page->count, - src_offset, - src_offset + copy_size, - NULL); - new->vector = CALLOC (new->count, - sizeof (struct iovec)); - ERR_ABORT (new->vector); - new->count = iov_subset (page->vector, - page->count, - src_offset, - src_offset + copy_size, - new->vector); - - - - /* add the ioc_fill to fill_list for this frame */ - if (list_empty (&local->fill_list)) { - /* if list is empty, then this is the first - * time we are filling frame, add the - * ioc_fill_t to the end of list */ - list_add_tail (&new->list, &local->fill_list); - } else { + gf_log (frame->this->name, GF_LOG_TRACE, + "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" " + "&& page->size = %"GF_PRI_SIZET" && wait_count = %d", + frame, offset, size, page->size, local->wait_count); + + /* immediately move this page to the end of the page_lru list */ + list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); + /* fill local->pending_size bytes from local->pending_offset */ + if (local->op_ret != -1) { + local->op_errno = op_errno; + + if (page->size == 0) { + goto done; + } + + if (offset > page->offset) + /* offset is offset in file, convert it to offset in + * page */ + src_offset = offset - page->offset; + /*FIXME: since offset is the offset within page is the + * else case valid? */ + else + /* local->pending_offset is in previous page. 
do not + * fill until we have filled all previous pages */ + dst_offset = page->offset - offset; + + /* we have to copy from offset to either end of this page + * or till the requested size */ + copy_size = min (page->size - src_offset, + size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } + + gf_log (page->inode->table->xl->name, GF_LOG_TRACE, + "copy_size = %"GF_PRI_SIZET" && src_offset = " + "%"PRId64" && dst_offset = %"PRId64"", + copy_size, src_offset, dst_offset); + + { + new = GF_CALLOC (1, sizeof (*new), + gf_ioc_mt_ioc_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref (page->iobref); + new->count = iov_subset (page->vector, page->count, + src_offset, + src_offset + copy_size, + NULL); + + new->vector = GF_CALLOC (new->count, + sizeof (struct iovec), + gf_ioc_mt_iovec); + if (new->vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + + iobref_unref (new->iobref); + GF_FREE (new); + goto out; + } + + new->count = iov_subset (page->vector, page->count, + src_offset, + src_offset + copy_size, + new->vector); + + /* add the ioc_fill to fill_list for this frame */ + if (list_empty (&local->fill_list)) { + /* if list is empty, then this is the first + * time we are filling frame, add the + * ioc_fill_t to the end of list */ + list_add_tail (&new->list, &local->fill_list); + } else { found = 0; - /* list is not empty, we need to look for - * where this offset fits in list */ - list_for_each_entry (fill, &local->fill_list, - list) { - if (fill->offset > new->offset) { - found = 1; - break; - } - } - - if (found) { - found = 0; - list_add_tail (&new->list, - &fill->list); - } else { - list_add_tail (&new->list, - &local->fill_list); - } - } - } - local->op_ret += copy_size; - } + /* list is not empty, we need to look for + * where this offset fits in list */ + list_for_each_entry (fill, &local->fill_list, + list) { + if (fill->offset > new->offset) { + found = 1; + break; + } + } + + if (found) { + list_add_tail (&new->list, + &fill->list); + } else { + list_add_tail (&new->list, + &local->fill_list); + } + } + } + + local->op_ret += copy_size; + } + +done: + ret = 0; +out: + return ret; } /* - * ioc_frame_unwind - frame unwinds only from here + * ioc_frame_unwind - frame unwinds only from here * * @frame: call frame to unwind * @@ -613,66 +797,101 @@ ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, static void ioc_frame_unwind (call_frame_t *frame) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL, *next = NULL; - int32_t count = 0; - struct iovec *vector = NULL; - int32_t copied = 0; - struct iobref *iobref = NULL; - struct stat stbuf = {0,}; - int32_t op_ret = 0; + ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL, *next = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + struct iatt stbuf = {0,}; + int32_t op_ret = 0, op_errno = 0; + + GF_ASSERT (frame); local = frame->local; - // ioc_local_lock (local); - iobref = iobref_new (); - - frame->local = NULL; - - if (list_empty (&local->fill_list)) { - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) has 0 entries in local->fill_list " - "(offset = %"PRId64" && size = %"GF_PRI_SIZET")", - frame, local->offset, local->size); - } - - list_for_each_entry (fill, &local->fill_list, 
list) { - count += fill->count; - } - - vector = CALLOC (count, sizeof (*vector)); - ERR_ABORT (vector); - - list_for_each_entry_safe (fill, next, &local->fill_list, list) { - memcpy (((char *)vector) + copied, - fill->vector, - fill->count * sizeof (*vector)); - - copied += (fill->count * sizeof (*vector)); - - iobref_merge (iobref, fill->iobref); - - list_del (&fill->list); - iobref_unref (fill->iobref); - free (fill->vector); - free (fill); - } - - op_ret = iov_length (vector, count); - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) unwinding with op_ret=%d", frame, op_ret); - - // ioc_local_unlock (local); - - STACK_UNWIND (frame, op_ret, local->op_errno, vector, count, - &stbuf, iobref); - - iobref_unref (iobref); - - pthread_mutex_destroy (&local->local_lock); - free (local); - free (vector); - - return; + if (local == NULL) { + gf_log (frame->this->name, GF_LOG_WARNING, + "local is NULL"); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (local->op_ret < 0) { + op_ret = local->op_ret; + op_errno = local->op_errno; + goto unwind; + } + + // ioc_local_lock (local); + iobref = iobref_new (); + if (iobref == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + if (list_empty (&local->fill_list)) { + gf_log (frame->this->name, GF_LOG_TRACE, + "frame(%p) has 0 entries in local->fill_list " + "(offset = %"PRId64" && size = %"GF_PRI_SIZET")", + frame, local->offset, local->size); + } + + list_for_each_entry (fill, &local->fill_list, list) { + count += fill->count; + } + + vector = GF_CALLOC (count, sizeof (*vector), gf_ioc_mt_iovec); + if (vector == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + list_for_each_entry_safe (fill, next, &local->fill_list, list) { + if ((vector != NULL) && (iobref != NULL)) { + memcpy (((char *)vector) + copied, + fill->vector, + fill->count * sizeof (*vector)); + + copied += (fill->count * sizeof (*vector)); + + iobref_merge (iobref, fill->iobref); + } + + list_del (&fill->list); + iobref_unref (fill->iobref); + GF_FREE (fill->vector); + GF_FREE (fill); + } + + if (op_ret != -1) { + op_ret = iov_length (vector, count); + } + +unwind: + gf_log (frame->this->name, GF_LOG_TRACE, + "frame(%p) unwinding with op_ret=%d", frame, op_ret); + + // ioc_local_unlock (local); + + frame->local = NULL; + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, + count, &stbuf, iobref, NULL); + + if (iobref != NULL) { + iobref_unref (iobref); + } + + if (vector != NULL) { + GF_FREE (vector); + vector = NULL; + } + + pthread_mutex_destroy (&local->local_lock); + if (local) + mem_put (local); + + return; } /* @@ -684,56 +903,119 @@ ioc_frame_unwind (call_frame_t *frame) void ioc_frame_return (call_frame_t *frame) { - ioc_local_t *local = NULL; - int32_t wait_count = 0; + ioc_local_t *local = NULL; + int32_t wait_count = 0; + + GF_ASSERT (frame); local = frame->local; - assert (local->wait_count > 0); + GF_ASSERT (local->wait_count > 0); - ioc_local_lock (local); - { - wait_count = --local->wait_count; - } - ioc_local_unlock (local); + ioc_local_lock (local); + { + wait_count = --local->wait_count; + } + ioc_local_unlock (local); - if (!wait_count) { - ioc_frame_unwind (frame); - } + if (!wait_count) { + ioc_frame_unwind (frame); + } - return; + return; } -/* +/* * ioc_page_wakeup - * @page: * * to be called only when a frame is waiting on an in-transit page */ ioc_waitq_t * -ioc_page_wakeup (ioc_page_t *page) +__ioc_page_wakeup (ioc_page_t *page, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - - waitq = 
page->waitq; - page->waitq = NULL; - - trav = waitq; - page->ready = 1; - - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "page is %p && waitq = %p", page, waitq); - - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ioc_frame_fill (page, frame, trav->pending_offset, - trav->pending_size); - } - - return waitq; + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("io-cache", page, out); + + waitq = page->waitq; + page->waitq = NULL; + + page->ready = 1; + + gf_log (page->inode->table->xl->name, GF_LOG_TRACE, + "page is %p && waitq = %p", page, waitq); + + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ret = __ioc_frame_fill (page, frame, trav->pending_offset, + trav->pending_size, op_errno); + if (ret == -1) { + break; + } + } + + if (page->stale) { + __ioc_page_destroy (page); + } + +out: + return waitq; } + +/* + * ioc_page_error - + * @page: + * @op_ret: + * @op_errno: + * + */ +ioc_waitq_t * +__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) +{ + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int64_t ret = 0; + ioc_table_t *table = NULL; + ioc_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO ("io-cache", page, out); + + waitq = page->waitq; + page->waitq = NULL; + + gf_log (page->inode->table->xl->name, GF_LOG_WARNING, + "page error for page = %p & waitq = %p", page, waitq); + + for (trav = waitq; trav; trav = trav->next) { + + frame = trav->data; + + local = frame->local; + ioc_local_lock (local); + { + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + ioc_local_unlock (local); + } + + table = page->inode->table; + ret = __ioc_page_destroy (page); + + if (ret != -1) { + table->cache_used -= ret; + } + +out: + return waitq; +} + /* * ioc_page_error - * @page: @@ -744,39 +1026,18 @@ ioc_page_wakeup (ioc_page_t *page) ioc_waitq_t * ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - int64_t ret = 0; - ioc_table_t *table = NULL; - ioc_local_t *local = NULL; - - waitq = page->waitq; - page->waitq = NULL; - - gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, - "page error for page = %p & waitq = %p", page, waitq); - - for (trav = waitq; trav; trav = trav->next) { - - frame = trav->data; - - local = frame->local; - ioc_local_lock (local); - { - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - } - ioc_local_unlock (local); - } - - table = page->inode->table; - ret = ioc_page_destroy (page); - - if (ret != -1) { - table->cache_used -= ret; - } - - return waitq; + ioc_waitq_t *waitq = NULL; + + if (page == NULL) { + goto out; + } + + ioc_inode_lock (page->inode); + { + waitq = __ioc_page_error (page, op_ret, op_errno); + } + ioc_inode_unlock (page->inode); + +out: + return waitq; } diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am index 38dea3eb7..d63042e7c 100644 --- a/xlators/performance/io-threads/src/Makefile.am +++ b/xlators/performance/io-threads/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = io-threads.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -io_threads_la_LDFLAGS = -module -avoidversion +io_threads_la_LDFLAGS = -module -avoid-version io_threads_la_SOURCES = io-threads.c io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS 
= io-threads.h +noinst_HEADERS = io-threads.h iot-mem-types.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 98e212ba2..bbcf4ed26 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -31,358 +22,415 @@ #include <stdlib.h> #include <sys/time.h> #include <time.h> +#include "locking.h" -typedef void *(*iot_worker_fn)(void*); +void *iot_worker (void *arg); +int iot_workers_scale (iot_conf_t *conf); +int __iot_workers_scale (iot_conf_t *conf); +struct volume_options options[]; -void -_iot_queue (iot_worker_t *worker, iot_request_t *req); +call_stub_t * +__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep) +{ + call_stub_t *stub = NULL; + int i = 0; + struct timeval curtv = {0,}, difftv = {0,}; + + *pri = -1; + sleep->tv_sec = 0; + sleep->tv_nsec = 0; + for (i = 0; i < IOT_PRI_MAX; i++) { + if (list_empty (&conf->reqs[i]) || + (conf->ac_iot_count[i] >= conf->ac_iot_limit[i])) + continue; + + if (i == IOT_PRI_LEAST) { + pthread_mutex_lock(&conf->throttle.lock); + if (!conf->throttle.sample_time.tv_sec) { + /* initialize */ + gettimeofday(&conf->throttle.sample_time, NULL); + } else { + /* + * Maintain a running count of least priority + * operations that are handled over a particular + * time interval. The count is provided via + * state dump and is used as a measure against + * least priority op throttling. + */ + gettimeofday(&curtv, NULL); + timersub(&curtv, &conf->throttle.sample_time, + &difftv); + if (difftv.tv_sec >= IOT_LEAST_THROTTLE_DELAY) { + conf->throttle.cached_rate = + conf->throttle.sample_cnt; + conf->throttle.sample_cnt = 0; + conf->throttle.sample_time = curtv; + } + + /* + * If we're over the configured rate limit, + * provide an absolute time to the caller that + * represents the soonest we're allowed to + * return another least priority request. 
+ */ + if (conf->throttle.rate_limit && + conf->throttle.sample_cnt >= + conf->throttle.rate_limit) { + struct timeval delay; + delay.tv_sec = IOT_LEAST_THROTTLE_DELAY; + delay.tv_usec = 0; + + timeradd(&conf->throttle.sample_time, + &delay, &curtv); + TIMEVAL_TO_TIMESPEC(&curtv, sleep); + + pthread_mutex_unlock( + &conf->throttle.lock); + break; + } + } + conf->throttle.sample_cnt++; + pthread_mutex_unlock(&conf->throttle.lock); + } + + stub = list_entry (conf->reqs[i].next, call_stub_t, list); + conf->ac_iot_count[i]++; + *pri = i; + break; + } -iot_request_t * -iot_init_request (call_stub_t *stub); + if (!stub) + return NULL; -void -iot_startup_workers (iot_worker_t **workers, int start_idx, int count, - iot_worker_fn workerfunc); + conf->queue_size--; + conf->queue_sizes[*pri]--; + list_del_init (&stub->list); -void * -iot_worker_unordered (void *arg); + return stub; +} -void * -iot_worker_ordered (void *arg); void -iot_startup_worker (iot_worker_t *worker, iot_worker_fn workerfunc); +__iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri) +{ + if (pri < 0 || pri >= IOT_PRI_MAX) + pri = IOT_PRI_MAX-1; -void -iot_destroy_request (iot_request_t * req); + list_add_tail (&stub->list, &conf->reqs[pri]); + conf->queue_size++; + conf->queue_sizes[pri]++; -/* I know this function modularizes things a bit too much, - * but it is easier on the eyes to read this than see all that locking, - * queueing, and thread firing in the same curly block, as was the - * case before this function. - */ -void -iot_request_queue_and_thread_fire (iot_worker_t *worker, - iot_worker_fn workerfunc, iot_request_t *req) -{ - pthread_mutex_lock (&worker->qlock); - { - if (iot_worker_active (worker)) - _iot_queue (worker, req); - else { - iot_startup_worker (worker, workerfunc); - _iot_queue (worker, req); - } - } - pthread_mutex_unlock (&worker->qlock); + return; } -int -iot_unordered_request_balancer (iot_conf_t *conf) -{ - long int rand = 0; - int idx = 0; +void * +iot_worker (void *data) +{ + iot_conf_t *conf = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + struct timespec sleep_till = {0, }; + int ret = 0; + int pri = -1; + char timeout = 0; + char bye = 0; + struct timespec sleep = {0,}; + + conf = data; + this = conf->this; + THIS = this; + + for (;;) { + sleep_till.tv_sec = time (NULL) + conf->idle_time; + + pthread_mutex_lock (&conf->mutex); + { + if (pri != -1) { + conf->ac_iot_count[pri]--; + pri = -1; + } + while (conf->queue_size == 0) { + conf->sleep_count++; + + ret = pthread_cond_timedwait (&conf->cond, + &conf->mutex, + &sleep_till); + conf->sleep_count--; + + if (ret == ETIMEDOUT) { + timeout = 1; + break; + } + } + + if (timeout) { + if (conf->curr_count > IOT_MIN_THREADS) { + conf->curr_count--; + bye = 1; + gf_log (conf->this->name, GF_LOG_DEBUG, + "timeout, terminated. conf->curr_count=%d", + conf->curr_count); + } else { + timeout = 0; + } + } + + stub = __iot_dequeue (conf, &pri, &sleep); + if (!stub && (sleep.tv_sec || sleep.tv_nsec)) { + pthread_cond_timedwait(&conf->cond, + &conf->mutex, &sleep); + pthread_mutex_unlock(&conf->mutex); + continue; + } + } + pthread_mutex_unlock (&conf->mutex); - /* Decide which thread will service the request. - * FIXME: This should change into some form of load-balancing. 
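/*
 * Illustrative sketch (not lines from this patch): the least-priority
 * throttle in __iot_dequeue above boils down to a fixed-window rate
 * limiter: count requests over a window of IOT_LEAST_THROTTLE_DELAY
 * seconds and, once the configured rate limit is reached, hand the
 * caller an absolute time at which the next request may be served.
 * The stripped-down version below shows just that mechanism; the
 * struct and function names are invented for the illustration and the
 * cached_rate bookkeeping used for statedump is left out. Only
 * gettimeofday(), timersub() and timeradd() from <sys/time.h> are
 * assumed.
 */
#include <stdint.h>
#include <sys/time.h>

struct throttle_sketch {
        struct timeval window_start; /* start of the current sample window */
        uint32_t       count;        /* requests seen in this window       */
        uint32_t       limit;        /* 0 means no limit                   */
        time_t         window_len;   /* window length, in seconds          */
};

/* Returns 1 and fills *wake_at when the caller must back off, 0 otherwise. */
static int
throttle_check (struct throttle_sketch *t, struct timeval *wake_at)
{
        struct timeval now = {0,}, diff = {0,}, win = {0,};

        gettimeofday (&now, NULL);
        timersub (&now, &t->window_start, &diff);

        if (diff.tv_sec >= t->window_len) {
                /* window expired: start counting afresh */
                t->window_start = now;
                t->count        = 0;
        }

        if (t->limit && t->count >= t->limit) {
                /* over the limit: the soonest permitted service time is
                 * the end of the current window */
                win.tv_sec  = t->window_len;
                win.tv_usec = 0;
                timeradd (&t->window_start, &win, wake_at);
                return 1;
        }

        t->count++;
        return 0;
}
/* A caller such as the worker loop would convert *wake_at to a struct
 * timespec (as the patch does with TIMEVAL_TO_TIMESPEC) and sleep on a
 * condition variable until that time. */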
- * */ - rand = random (); + if (stub) /* guard against spurious wakeups */ + call_resume (stub); - /* If scaling is on, we can choose from any thread - * that has been allocated upto, max_o_threads, but - * with scaling off, we'll never have threads more - * than min_o_threads. - */ - if (iot_unordered_scaling_on (conf)) - idx = (rand % conf->max_u_threads); - else - idx = (rand % conf->min_u_threads); + if (bye) + break; + } - return idx; + if (pri != -1) { + pthread_mutex_lock (&conf->mutex); + { + conf->ac_iot_count[pri]--; + } + pthread_mutex_unlock (&conf->mutex); + } + return NULL; } -void -iot_schedule_unordered (iot_conf_t *conf, inode_t *inode, call_stub_t *stub) +int +do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri) { - int32_t idx = 0; - iot_worker_t *selected_worker = NULL; - iot_request_t *req = NULL; - - idx = iot_unordered_request_balancer (conf); - selected_worker = conf->uworkers[idx]; + int ret = 0; - req = iot_init_request (stub); - iot_request_queue_and_thread_fire (selected_worker, - iot_worker_unordered, req); -} - - -/* Only to be used with ordered requests. - */ -uint64_t -iot_create_inode_worker_assoc (iot_conf_t * conf, inode_t * inode) -{ - long int rand = 0; - uint64_t idx = 0; + pthread_mutex_lock (&conf->mutex); + { + __iot_enqueue (conf, stub, pri); - rand = random (); - /* If scaling is on, we can choose from any thread - * that has been allocated upto, max_o_threads, but - * with scaling off, we'll never have threads more - * than min_o_threads. - */ - if (iot_ordered_scaling_on (conf)) - idx = (rand % conf->max_o_threads); - else - idx = (rand % conf->min_o_threads); + pthread_cond_signal (&conf->cond); - __inode_ctx_put (inode, conf->this, idx); + ret = __iot_workers_scale (conf); + } + pthread_mutex_unlock (&conf->mutex); - return idx; + return ret; } +char* +iot_get_pri_meaning (iot_pri_t pri) +{ + char *name = NULL; + switch (pri) { + case IOT_PRI_HI: + name = "fast"; + break; + case IOT_PRI_NORMAL: + name = "normal"; + break; + case IOT_PRI_LO: + name = "slow"; + break; + case IOT_PRI_LEAST: + name = "least priority"; + break; + case IOT_PRI_MAX: + name = "invalid"; + break; + } + return name; +} -/* Assumes inode lock is held. */ int -iot_ordered_request_balancer (iot_conf_t *conf, inode_t *inode, uint64_t *idx) +iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) { - int ret = 0; + int ret = -1; + iot_pri_t pri = IOT_PRI_MAX - 1; + iot_conf_t *conf = this->private; - if (__inode_ctx_get (inode, conf->this, idx) < 0) - *idx = iot_create_inode_worker_assoc (conf, inode); - else { - /* Sanity check to ensure the idx received from the inode - * context is within bounds. We're a bit optimistic in - * assuming that if an index is within bounds, it is - * not corrupted. idx is uint so we dont check for less - * than 0. 
- */ - if ((*idx >= (uint64_t)conf->max_o_threads)) { - gf_log (conf->this->name, GF_LOG_DEBUG, - "inode context returned insane thread index %" - PRIu64, *idx); - ret = -1; - } + if ((frame->root->pid < GF_CLIENT_PID_MAX) && conf->least_priority) { + pri = IOT_PRI_LEAST; + goto out; } - return ret; -} - - -void -iot_schedule_ordered (iot_conf_t *conf, inode_t *inode, call_stub_t *stub) -{ - uint64_t idx = 0; - iot_worker_t *selected_worker = NULL; - iot_request_t *req = NULL; - int balstatus = 0; - - if (inode == NULL) { - gf_log (conf->this->name, GF_LOG_DEBUG, - "Got NULL inode for ordered request"); - STACK_UNWIND (stub->frame, -1, EINVAL, NULL); - call_stub_destroy (stub); - return; - } - req = iot_init_request (stub); - LOCK (&inode->lock); - { - balstatus = iot_ordered_request_balancer (conf, inode, &idx); - if (balstatus < 0) { - gf_log (conf->this->name, GF_LOG_DEBUG, - "Insane worker index. Unwinding stack"); - STACK_UNWIND (stub->frame, -1, ECANCELED, NULL); - iot_destroy_request (req); - call_stub_destroy (stub); - goto unlock_out; - } - /* inode lock once acquired, cannot be left here - * because other gluster main threads might be - * contending on it to append a request for this file. - * So we'll also leave the lock only after we've - * added the request to the worker queue. - */ - selected_worker = conf->oworkers[idx]; - iot_request_queue_and_thread_fire (selected_worker, - iot_worker_ordered, req); + switch (stub->fop) { + case GF_FOP_OPEN: + case GF_FOP_STAT: + case GF_FOP_FSTAT: + case GF_FOP_LOOKUP: + case GF_FOP_ACCESS: + case GF_FOP_READLINK: + case GF_FOP_OPENDIR: + case GF_FOP_STATFS: + case GF_FOP_READDIR: + case GF_FOP_READDIRP: + pri = IOT_PRI_HI; + break; + + case GF_FOP_CREATE: + case GF_FOP_FLUSH: + case GF_FOP_LK: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + case GF_FOP_UNLINK: + case GF_FOP_SETATTR: + case GF_FOP_FSETATTR: + case GF_FOP_MKNOD: + case GF_FOP_MKDIR: + case GF_FOP_RMDIR: + case GF_FOP_SYMLINK: + case GF_FOP_RENAME: + case GF_FOP_LINK: + case GF_FOP_SETXATTR: + case GF_FOP_GETXATTR: + case GF_FOP_FGETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_REMOVEXATTR: + case GF_FOP_FREMOVEXATTR: + pri = IOT_PRI_NORMAL; + break; + + case GF_FOP_READ: + case GF_FOP_WRITE: + case GF_FOP_FSYNC: + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_FSYNCDIR: + case GF_FOP_XATTROP: + case GF_FOP_FXATTROP: + case GF_FOP_RCHECKSUM: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + pri = IOT_PRI_LO; + break; + + case GF_FOP_NULL: + case GF_FOP_FORGET: + case GF_FOP_RELEASE: + case GF_FOP_RELEASEDIR: + case GF_FOP_GETSPEC: + case GF_FOP_MAXVALUE: + //fail compilation on missing fop + //new fop must choose priority. 
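/* Illustration only (not lines from this patch): because the switch has no
 * default label, the intent of the comment above is that an unhandled value
 * of the fop enum gets flagged at compile time (e.g. via -Wswitch), so every
 * newly added fop is forced to pick a priority explicitly. A hypothetical
 * GF_FOP_FOO would be classified like any other fop:
 *
 *         case GF_FOP_FOO:
 *                 pri = IOT_PRI_NORMAL;
 *                 break;
 */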
+ break; } -unlock_out: - UNLOCK (&inode->lock); +out: + gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop", + gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); + ret = do_iot_schedule (this->private, stub, pri); + return ret; } - int iot_lookup_cbk (call_frame_t *frame, void * cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr); + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); return 0; } int iot_lookup_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) + dict_t *xdata) { STACK_WIND (frame, iot_lookup_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->lookup, - loc, xattr_req); + loc, xdata); return 0; } int -iot_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +iot_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xattr_req); + stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create lookup stub (out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); - return 0; -} - + ret = iot_schedule (frame, this, stub); -int -iot_chmod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - - -int -iot_chmod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode) -{ - STACK_WIND (frame, iot_chmod_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->chmod, - loc, mode); - return 0; -} - - -int -iot_chmod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) -{ - call_stub_t *stub = NULL; - fd_t *fd = NULL; - - stub = fop_chmod_stub (frame, iot_chmod_wrapper, loc, mode); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create chmod stub" - "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; +out: + if (ret < 0) { + if (stub != NULL) { + call_stub_destroy (stub); + } + STACK_UNWIND_STRICT (lookup, frame, -1, -ret, NULL, NULL, NULL, + NULL); } - fd = fd_lookup (loc->inode, frame->root->pid); - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); - } return 0; } int -iot_fchmod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +iot_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, + xdata); return 0; } int -iot_fchmod_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode) +iot_setattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - STACK_WIND (frame, iot_fchmod_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fchmod, fd, mode); + STACK_WIND (frame, iot_setattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD 
(this)->fops->setattr, + loc, stbuf, valid, xdata); return 0; } int -iot_fchmod (call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode) +iot_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fchmod_stub (frame, iot_fchmod_wrapper, fd, mode); + stub = fop_setattr_stub (frame, iot_setattr_wrapper, loc, stbuf, valid, + xdata); if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fchmod stub" - "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + gf_log (this->name, GF_LOG_ERROR, "Cannot create setattr stub" + "(Out of memory)"); + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); - return 0; -} - - -int -iot_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - - -int -iot_chown_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, - gid_t gid) -{ - STACK_WIND (frame, iot_chown_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->chown, - loc, uid, gid); - return 0; -} + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + if (stub != NULL) { + call_stub_destroy (stub); + } -int -iot_chown (call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, - gid_t gid) -{ - call_stub_t *stub = NULL; - fd_t *fd = NULL; - - stub = fop_chown_stub (frame, iot_chown_wrapper, loc, uid, gid); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create chown stub" - "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - - fd = fd_lookup (loc->inode, frame->root->pid); - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); + STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL, NULL); } return 0; @@ -390,116 +438,152 @@ iot_chown (call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, int -iot_fchown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +iot_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, preop, postop, + xdata); return 0; } int -iot_fchown_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid) +iot_fsetattr_wrapper (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) { - STACK_WIND (frame, iot_fchown_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fchown, fd, uid, gid); + STACK_WIND (frame, iot_fsetattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid, + xdata); return 0; } int -iot_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid) +iot_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fchown_stub (frame, iot_fchown_wrapper, fd, uid, gid); + stub = fop_fsetattr_stub (frame, iot_fsetattr_wrapper, fd, stbuf, + valid, xdata); if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fchown stub" + gf_log (this->name, GF_LOG_ERROR, "cannot 
create fsetattr stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); return 0; } int iot_access_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t mask) + int32_t mask, dict_t *xdata) { STACK_WIND (frame, iot_access_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->access, loc, mask); + FIRST_CHILD (this)->fops->access, loc, mask, xdata); return 0; } int -iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_access_stub (frame, iot_access_wrapper, loc, mask); + stub = fop_access_stub (frame, iot_access_wrapper, loc, mask, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create access stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (access, frame, -1, -ret, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path) + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *stbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, path); + STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, stbuf, + xdata); return 0; } int iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size) + size_t size, dict_t *xdata) { STACK_WIND (frame, iot_readlink_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->readlink, - loc, size); + loc, size, xdata); return 0; } int -iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) +iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size); + stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create readlink stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); return 0; } @@ -507,39 +591,53 @@ iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) int iot_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - 
STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); return 0; } int iot_mknod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev) + dev_t rdev, mode_t umask, dict_t *xdata) { STACK_WIND (frame, iot_mknod_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mknod, loc, mode, rdev); + FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, umask, + xdata); return 0; } int iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev) + dev_t rdev, mode_t umask, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev); + stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev, + umask, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create mknod stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } @@ -547,72 +645,99 @@ iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, int iot_mkdir_cbk (call_frame_t *frame, void * cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); return 0; } int -iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) +iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { STACK_WIND (frame, iot_mkdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mkdir, loc, mode); + FIRST_CHILD (this)->fops->mkdir, loc, mode, umask, xdata); return 0; } int -iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) +iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode); + stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode, umask, + xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create mkdir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, + postparent, xdata); return 0; } int -iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { STACK_WIND 
(frame, iot_rmdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rmdir, loc); + FIRST_CHILD (this)->fops->rmdir, loc, flags, xdata); return 0; } int -iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc); + stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc, flags, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create rmdir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } @@ -620,114 +745,157 @@ iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) int iot_symlink_cbk (call_frame_t *frame, void * cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); return 0; } int iot_symlink_wrapper (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc) + loc_t *loc, mode_t umask, dict_t *xdata) { STACK_WIND (frame, iot_symlink_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->symlink, linkname, loc); + FIRST_CHILD (this)->fops->symlink, linkname, loc, umask, + xdata); return 0; } int iot_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc) + loc_t *loc, mode_t umask, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc); + stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc, + umask, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create symlink stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); return 0; } int iot_rename_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); return 0; } int iot_rename_wrapper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { STACK_WIND (frame, iot_rename_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rename, oldloc, newloc); + FIRST_CHILD (this)->fops->rename, oldloc, newloc, xdata); return 0; } int -iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { 
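/* Schematic sketch (not lines from this patch): each fop in this translator
 * now follows the same template that replaces the old ordered/unordered
 * scheduling: build a call stub around a thin wrapper that simply
 * STACK_WINDs to the child, queue the stub with iot_schedule(), and on any
 * failure destroy the stub and unwind with the negated return value as the
 * errno. With the fop-specific pieces written as <fop> placeholders:
 *
 *         stub = fop_<fop>_stub (frame, iot_<fop>_wrapper, <args>, xdata);
 *         if (!stub) {
 *                 ret = -ENOMEM;
 *                 goto out;
 *         }
 *         ret = iot_schedule (frame, this, stub);
 * out:
 *         if (ret < 0) {
 *                 if (stub != NULL)
 *                         call_stub_destroy (stub);
 *                 STACK_UNWIND_STRICT (<fop>, frame, -1, -ret, <NULL args>);
 *         }
 */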
call_stub_t *stub = NULL; + int ret = -1; - stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc); + stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc, xdata); if (!stub) { gf_log (this->name, GF_LOG_DEBUG, "cannot create rename stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } } - iot_schedule_unordered ((iot_conf_t *)this->private, oldloc->inode, - stub); return 0; } int iot_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) + int32_t op_errno, fd_t *fd, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); return 0; } int iot_open_wrapper (call_frame_t * frame, xlator_t * this, loc_t *loc, - int32_t flags, fd_t * fd) + int32_t flags, fd_t * fd, dict_t *xdata) { STACK_WIND (frame, iot_open_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, loc, flags, fd); + FIRST_CHILD (this)->fops->open, loc, flags, fd, + xdata); return 0; } int iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd) + fd_t *fd, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd); + stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd, + xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create open call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, 0); - return 0; + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); return 0; } @@ -736,42 +904,57 @@ iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, int iot_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *stbuf) + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); return 0; } int iot_create_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd) + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) { STACK_WIND (frame, iot_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd); + loc, flags, mode, umask, fd, xdata); return 0; } int iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd) + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; stub = fop_create_stub (frame, iot_create_wrapper, loc, flags, mode, - fd); + umask, fd, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create \"create\" call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, 0); - return 0; + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL, 
NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); return 0; } @@ -779,9 +962,11 @@ iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, int iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, + stbuf, iobref, xdata); return 0; } @@ -789,120 +974,155 @@ iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int iot_readv_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, uint32_t flags, dict_t *xdata) { STACK_WIND (frame, iot_readv_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, - fd, size, offset); + fd, size, offset, flags, xdata); return 0; } int iot_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, uint32_t flags, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset); + stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset, + flags, xdata); if (!stub) { - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, GF_LOG_ERROR, "cannot create readv call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, 0); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, -1, NULL, + NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); return 0; } int -iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { STACK_WIND (frame, iot_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, - fd); + fd, xdata); return 0; } int -iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_flush_stub (frame, iot_flush_wrapper, fd); + stub = fop_flush_stub (frame, iot_flush_wrapper, fd, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create flush_cbk call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (flush, frame, -1, -ret, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); 
return 0; } int iot_fsync_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { STACK_WIND (frame, iot_fsync_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsync, - fd, datasync); + fd, datasync, xdata); return 0; } int -iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) +iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync); + stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fsync_cbk call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -1; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fsync, frame, -1, -ret, NULL, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, stbuf); + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); return 0; } @@ -910,12 +1130,13 @@ iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, - off_t offset, struct iobref *iobref) + off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { STACK_WIND (frame, iot_writev_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, iobref); + fd, vector, count, offset, flags, iobref, xdata); return 0; } @@ -923,22 +1144,31 @@ iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, int iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_writev_stub (frame, iot_writev_wrapper, - fd, vector, count, offset, iobref); + stub = fop_writev_stub (frame, iot_writev_wrapper, fd, vector, + count, offset, flags, iobref, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create writev call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } @@ -946,182 +1176,203 @@ iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t iot_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct flock *flock) + int32_t op_ret, int32_t op_errno, struct gf_flock *flock, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, flock); + STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata); return 0; } int iot_lk_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t cmd, struct flock *flock) + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { STACK_WIND (frame, iot_lk_cbk, 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, - fd, cmd, flock); + fd, cmd, flock, xdata); return 0; } int iot_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct flock *flock) + struct gf_flock *flock, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock); + stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_lk call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (lk, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); return 0; } int -iot_stat_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_stat_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { STACK_WIND (frame, iot_stat_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, - loc); + loc, xdata); return 0; } int -iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { call_stub_t *stub = NULL; - fd_t *fd = NULL; + int ret = -1; - stub = fop_stat_stub (frame, iot_stat_wrapper, loc); + stub = fop_stat_stub (frame, iot_stat_wrapper, loc, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_stat call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -1; + goto out; } - fd = fd_lookup (loc->inode, frame->root->pid); - /* File is not open, so we can send it through unordered pool. 
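/*
 * Not part of the patch: a note on the pattern these hunks repeat.  Every
 * fop gains a dict_t *xdata argument that the wrapper and callback pass
 * straight through, STACK_UNWIND becomes the type-checked
 * STACK_UNWIND_STRICT, and the old routing decision -- fd_lookup() on the
 * loc to choose between the ordered and unordered pools -- is replaced by
 * a single iot_schedule (frame, this, stub) call.  The request flow is now:
 *
 *   caller -> iot_<fop> (builds a call stub)
 *          -> iot_schedule (queues the stub by priority, wakes a worker)
 *          -> worker thread: call_resume (stub) -> iot_<fop>_wrapper
 *          -> STACK_WIND to FIRST_CHILD -> iot_<fop>_cbk
 *          -> STACK_UNWIND_STRICT back to the caller
 */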
- */ - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); - } + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (stat, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); return 0; } int -iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { STACK_WIND (frame, iot_fstat_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, - fd); + fd, xdata); return 0; } int -iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd); + stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_fstat call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fstat, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); return 0; } int iot_truncate_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) + off_t offset, dict_t *xdata) { STACK_WIND (frame, iot_truncate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, - loc, offset); + loc, offset, xdata); return 0; } int -iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { call_stub_t *stub; - fd_t *fd = NULL; - - stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset); + int ret = -1; + stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset, + xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_stat call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - fd = fd_lookup (loc->inode, frame->root->pid); - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL, + NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } } return 0; @@ -1130,173 +1381,107 @@ iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) int iot_ftruncate_cbk (call_frame_t *frame, 
void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); return 0; } int iot_ftruncate_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) + off_t offset, dict_t *xdata) { STACK_WIND (frame, iot_ftruncate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, - fd, offset); + fd, offset, xdata); return 0; } int -iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - call_stub_t *stub; + call_stub_t *stub = NULL; + int ret = -1; - stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset); + stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset, + xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_ftruncate call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); - - return 0; -} - - -int -iot_utimens_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (ftruncate, frame, -1, -ret, NULL, NULL, NULL); -int -iot_utimens_wrapper (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]) -{ - STACK_WIND (frame, iot_utimens_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->utimens, - loc, tv); - return 0; -} - - -int -iot_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct timespec tv[2]) -{ - call_stub_t *stub; - fd_t *fd = NULL; - - stub = fop_utimens_stub (frame, iot_utimens_wrapper, loc, tv); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_utimens call stub" - "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - - fd = fd_lookup (loc->inode, frame->root->pid); - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); + if (stub != NULL) { + call_stub_destroy (stub); + } } - return 0; } -int -iot_checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); - return 0; -} - - -int -iot_checksum_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags) -{ - STACK_WIND (frame, iot_checksum_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->checksum, - loc, flags); - - return 0; -} - - -int -iot_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags) -{ - call_stub_t *stub = NULL; - - stub = fop_checksum_stub (frame, iot_checksum_wrapper, loc, flags); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_checksum call stub" - "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); - - return 0; -} - int iot_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno) + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, + postparent, xdata); return 0; } int -iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t xflag, dict_t *xdata) { STACK_WIND (frame, iot_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc); - + loc, xflag, xdata); return 0; } int -iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) { call_stub_t *stub = NULL; - stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc); + int ret = -1; + + stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc, xflag, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fop_unlink call stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -1; + goto out; } - iot_schedule_unordered((iot_conf_t *)this->private, loc->inode, stub); + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } @@ -1305,827 +1490,1133 @@ iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) int iot_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); return 0; } int -iot_link_wrapper (call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new) +iot_link_wrapper (call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new, + dict_t *xdata) { STACK_WIND (frame, iot_link_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->link, old, new); + FIRST_CHILD (this)->fops->link, old, new, xdata); return 0; } int -iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc); + stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create link stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, oldloc->inode, - stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL, + NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); return 0; } int -iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { STACK_WIND (frame, 
iot_opendir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->opendir, loc, fd); + FIRST_CHILD (this)->fops->opendir, loc, fd, xdata); return 0; } int -iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd); + stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create opendir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, loc->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); return 0; } int iot_fsyncdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int datasync) + int datasync, dict_t *xdata) { STACK_WIND (frame, iot_fsyncdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsyncdir, fd, datasync); + FIRST_CHILD (this)->fops->fsyncdir, fd, datasync, xdata); return 0; } int -iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync); + stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync, + xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fsyncdir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fsyncdir, frame, -1, -ret, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf) + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); + STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); return 0; } int -iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) { STACK_WIND (frame, iot_statfs_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->statfs, loc); + FIRST_CHILD (this)->fops->statfs, loc, xdata); return 0; } int -iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc); + stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create statfs stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_unordered ((iot_conf_t *)this->private, 
loc->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (statfs, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); return 0; } int iot_setxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags) + dict_t *dict, int32_t flags, dict_t *xdata) { STACK_WIND (frame, iot_setxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setxattr, loc, dict, flags); + FIRST_CHILD (this)->fops->setxattr, loc, dict, flags, xdata); return 0; } int iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags) + int32_t flags, dict_t *xdata) { call_stub_t *stub = NULL; - fd_t *fd = NULL; + int ret = -1; stub = fop_setxattr_stub (frame, iot_setxattr_wrapper, loc, dict, - flags); + flags, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create setxattr stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - fd = fd_lookup (loc->inode, frame->root->pid); - if (fd == NULL) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); + ret = -ENOMEM; + goto out; } + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (setxattr, frame, -1, -ret, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int iot_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { STACK_WIND (frame, iot_getxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->getxattr, loc, name); + FIRST_CHILD (this)->fops->getxattr, loc, name, xdata); return 0; } int iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { call_stub_t *stub = NULL; - fd_t *fd = NULL; + int ret = -1; - stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name); + stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create getxattr stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - fd = fd_lookup (loc->inode, frame->root->pid); - if (!fd) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, loc->inode, - stub); - fd_unref (fd); - } + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - STACK_UNWIND (frame, 
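/*
 * Sketch, not part of the patch: every converted fop shares the same
 * failure path -- keep a negative errno in ret, unwind with -ret as
 * op_errno, and destroy the stub if one was created.  (A few call sites
 * above -- iot_fsync, iot_stat, iot_unlink -- set ret = -1 rather than
 * -ENOMEM, so they unwind with op_errno 1, i.e. EPERM.)  The shared path
 * could be captured in a helper macro; the name below is hypothetical:
 */
#define IOT_FOP_FAIL_UNWIND(fop, frame, ret, stub, args...)                \
        do {                                                               \
                STACK_UNWIND_STRICT (fop, frame, -1, -(ret), ##args);      \
                if (stub != NULL)                                          \
                        call_stub_destroy (stub);                          \
        } while (0)

/* e.g. the flush error path above is equivalent to
 *      IOT_FOP_FAIL_UNWIND (flush, frame, ret, stub, NULL);
 */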
op_ret, op_errno, dict); + STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int iot_fgetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name) + const char *name, dict_t *xdata) { STACK_WIND (frame, iot_fgetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fgetxattr, fd, name); + FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata); return 0; } int iot_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name) + const char *name, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name); + stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fgetxattr stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fgetxattr, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); return 0; } int iot_fsetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int32_t flags) + dict_t *dict, int32_t flags, dict_t *xdata) { STACK_WIND (frame, iot_fsetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags); + FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags, + xdata); return 0; } int iot_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags) + int32_t flags, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; stub = fop_fsetxattr_stub (frame, iot_fsetxattr_wrapper, fd, dict, - flags); + flags, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR, "cannot create fsetxattr stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fsetxattr, frame, -1, -ret, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int iot_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); + STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); return 0; } int iot_removexattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { STACK_WIND (frame, iot_removexattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->removexattr, loc, name); + FIRST_CHILD (this)->fops->removexattr, loc, name, xdata); return 0; } int iot_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { call_stub_t *stub = NULL; - fd_t *fd = NULL; + int ret = -1; stub = fop_removexattr_stub (frame, iot_removexattr_wrapper, loc, - name); + name, xdata); if (!stub) { gf_log (this->name, GF_LOG_ERROR,"cannot get removexattr fop" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM); - return 0; 
+ ret = -ENOMEM; + goto out; } - fd = fd_lookup (loc->inode, frame->root->pid); - if (!fd) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, - loc->inode, stub); - fd_unref (fd); - } + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (removexattr, frame, -1, -ret, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int -iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +iot_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, entries); + STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); return 0; } int -iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) +iot_fremovexattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readdir, fd, size, offset); + STACK_WIND (frame, iot_fremovexattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fremovexattr, fd, name, xdata); return 0; } int -iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +iot_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset); + stub = fop_fremovexattr_stub (frame, iot_fremovexattr_wrapper, fd, + name, xdata); if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" + gf_log (this->name, GF_LOG_ERROR,"cannot get fremovexattr fop" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fremovexattr, frame, -1, -ret, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int -iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +iot_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, xattr); + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); return 0; } int -iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) +iot_readdirp_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) { - STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr); + STACK_WIND (frame, iot_readdirp_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdirp, fd, size, offset, xdata); return 0; } int -iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) +iot_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { call_stub_t *stub = NULL; - fd_t *fd = NULL; + int ret = -1; - stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype, - xattr); + stub = fop_readdirp_stub (frame, iot_readdirp_wrapper, fd, size, + 
offset, xdata); if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub" + gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - fd = fd_lookup (loc->inode, frame->root->pid); - if (!fd) - iot_schedule_unordered ((iot_conf_t *)this->private, - loc->inode, stub); - else { - iot_schedule_ordered ((iot_conf_t *)this->private, - loc->inode, stub); - fd_unref (fd); - } + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (readdirp, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } int -iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, xattr); + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata); return 0; } + int -iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) +iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) { - STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr); + STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdir, fd, size, offset, xdata); return 0; } + int -iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) +iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { call_stub_t *stub = NULL; + int ret = -1; - stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype, - xattr); + stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset, + xdata); if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub" + gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" "(out of memory)"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + ret = -ENOMEM; + goto out; } - iot_schedule_ordered ((iot_conf_t *)this->private, fd->inode, stub); + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (readdir, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } return 0; } - -/* Must be called with worker lock held */ -void -_iot_queue (iot_worker_t *worker, iot_request_t *req) +int +iot_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - list_add_tail (&req->list, &worker->rqlist); - - /* dq_cond */ - worker->queue_size++; - pthread_cond_broadcast (&worker->dq_cond); + STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata); + return 0; } -iot_request_t * -iot_init_request (call_stub_t *stub) +int +iot_inodelk_wrapper (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, + dict_t *xdata) { - iot_request_t *req = NULL; - - req = CALLOC (1, sizeof (iot_request_t)); - ERR_ABORT (req); - req->stub = stub; - - return req; + STACK_WIND (frame, iot_inodelk_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->inodelk, volume, loc, cmd, lock, + xdata); + return 0; } -void -iot_destroy_request (iot_request_t * req) +int +iot_inodelk (call_frame_t *frame, 
xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock, + dict_t *xdata) { - if (req == NULL) - return; + call_stub_t *stub = NULL; + int ret = -1; - FREE (req); -} + stub = fop_inodelk_stub (frame, iot_inodelk_wrapper, + volume, loc, cmd, lock, xdata); + if (!stub) { + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (inodelk, frame, -1, -ret, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} -/* Must be called with worker lock held. */ -gf_boolean_t -iot_can_ordered_exit (iot_worker_t * worker) +int +iot_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_boolean_t allow_exit = _gf_false; - iot_conf_t *conf = NULL; + STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} - conf = worker->conf; - /* We dont want this thread to exit if its index is - * below the min thread count. - */ - if (worker->thread_idx >= conf->min_o_threads) - allow_exit = _gf_true; - return allow_exit; +int +iot_finodelk_wrapper (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + STACK_WIND (frame, iot_finodelk_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->finodelk, volume, fd, cmd, lock, + xdata); + return 0; } -/* Must be called with worker lock held. */ -gf_boolean_t -iot_ordered_exit (iot_worker_t *worker) +int +iot_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock, + dict_t *xdata) { - gf_boolean_t allow_exit = _gf_false; + call_stub_t *stub = NULL; + int ret = -1; - allow_exit = iot_can_ordered_exit (worker); - if (allow_exit) { - worker->state = IOT_STATE_DEAD; - worker->thread = 0; + stub = fop_finodelk_stub (frame, iot_finodelk_wrapper, + volume, fd, cmd, lock, xdata); + if (!stub) { + gf_log (this->private, GF_LOG_ERROR,"cannot get finodelk stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; } - return allow_exit; -} + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (finodelk, frame, -1, -ret, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} int -iot_ordered_request_wait (iot_worker_t * worker) +iot_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - struct timeval tv; - struct timespec ts; - int waitres = 0; - int retstat = 0; + STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata); + return 0; +} - gettimeofday (&tv, NULL); - ts.tv_sec = tv.tv_sec + worker->conf->o_idle_time; - /* Slightly skew the idle time for threads so that, we dont - * have all of them rushing to exit at the same time, if - * they've been idle. 
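/*
 * Sketch, not part of the patch: the per-worker iot_request_t and
 * _iot_queue machinery removed above gives way to queueing the call stub
 * itself on one of the per-priority lists that iot_conf_t now carries
 * (see the header changes at the end of this diff).  iot_schedule() is
 * defined earlier in the file and not shown in these hunks; conceptually
 * the enqueue side looks roughly like this (the priority choice and the
 * stub's list member are assumptions):
 */
static int
iot_schedule_sketch (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
{
        iot_conf_t *conf = this->private;
        iot_pri_t   pri  = IOT_PRI_NORMAL; /* the real code derives the
                                              priority from the fop type */

        (void) frame;

        pthread_mutex_lock (&conf->mutex);
        {
                /* stubs are queued directly; no iot_request_t wrapper */
                list_add_tail (&stub->list, &conf->reqs[pri]);
                conf->queue_sizes[pri]++;
                conf->queue_size++;

                /* grow the pool if the queues call for it ... */
                __iot_workers_scale (conf);

                /* ... and wake a worker to service the stub */
                pthread_cond_signal (&conf->cond);
        }
        pthread_mutex_unlock (&conf->mutex);

        return 0;
}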
- */ - ts.tv_nsec = skew_usec_idle_time (tv.tv_usec) * 1000; - waitres = pthread_cond_timedwait (&worker->dq_cond, &worker->qlock, - &ts); - if (waitres == ETIMEDOUT) - if (iot_ordered_exit (worker)) - retstat = -1; - return retstat; +int +iot_entrylk_wrapper (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + STACK_WIND (frame, iot_entrylk_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + return 0; } -call_stub_t * -iot_dequeue_ordered (iot_worker_t *worker) +int +iot_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - call_stub_t *stub = NULL; - iot_request_t *req = NULL; - int waitstat = 0; + call_stub_t *stub = NULL; + int ret = -1; - pthread_mutex_lock (&worker->qlock); - { - while (!worker->queue_size) { - waitstat = 0; - waitstat = iot_ordered_request_wait (worker); - /* We must've timed out and are now required to - * exit. - */ - if (waitstat == -1) - goto out; - } + stub = fop_entrylk_stub (frame, iot_entrylk_wrapper, + volume, loc, basename, cmd, type, xdata); + if (!stub) { + gf_log (this->private, GF_LOG_ERROR,"cannot get entrylk stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } - list_for_each_entry (req, &worker->rqlist, list) - break; - list_del (&req->list); - stub = req->stub; + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (entrylk, frame, -1, -ret, NULL); - worker->queue_size--; + if (stub != NULL) { + call_stub_destroy (stub); + } } -out: - pthread_mutex_unlock (&worker->qlock); + return 0; +} + +int +iot_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata); + return 0; +} - FREE (req); - return stub; +int +iot_fentrylk_wrapper (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + STACK_WIND (frame, iot_fentrylk_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + return 0; } -void * -iot_worker_ordered (void *arg) +int +iot_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - iot_worker_t *worker = arg; call_stub_t *stub = NULL; + int ret = -1; - while (1) { + stub = fop_fentrylk_stub (frame, iot_fentrylk_wrapper, + volume, fd, basename, cmd, type, xdata); + if (!stub) { + gf_log (this->private, GF_LOG_ERROR,"cannot get fentrylk stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } - stub = iot_dequeue_ordered (worker); - /* If stub is NULL, we must've timed out waiting for a - * request and have now been allowed to exit. - */ - if (stub == NULL) - break; - call_resume (stub); - } + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fentrylk, frame, -1, -ret, NULL); - return NULL; + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; } -/* Must be called with worker lock held. 
*/ -gf_boolean_t -iot_can_unordered_exit (iot_worker_t * worker) +int +iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) { - gf_boolean_t allow_exit = _gf_false; - iot_conf_t *conf = NULL; + STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, xdata); + return 0; +} - conf = worker->conf; - /* We dont want this thread to exit if its index is - * below the min thread count. - */ - if (worker->thread_idx >= conf->min_u_threads) - allow_exit = _gf_true; - return allow_exit; +int +iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr, xdata); + return 0; } -/* Must be called with worker lock held. */ -gf_boolean_t -iot_unordered_exit (iot_worker_t *worker) +int +iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - gf_boolean_t allow_exit = _gf_false; + call_stub_t *stub = NULL; + int ret = -1; - allow_exit = iot_can_unordered_exit (worker); - if (allow_exit) { - worker->state = IOT_STATE_DEAD; - worker->thread = 0; + stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype, + xattr, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; } - return allow_exit; + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (xattrop, frame, -1, -ret, NULL, NULL); + + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; } int -iot_unordered_request_wait (iot_worker_t * worker) +iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) { - struct timeval tv; - struct timespec ts; - int waitres = 0; - int retstat = 0; - - gettimeofday (&tv, NULL); - ts.tv_sec = tv.tv_sec + worker->conf->u_idle_time; - /* Slightly skew the idle time for threads so that, we dont - * have all of them rushing to exit at the same time, if - * they've been idle. - */ - ts.tv_nsec = skew_usec_idle_time (tv.tv_usec) * 1000; - waitres = pthread_cond_timedwait (&worker->dq_cond, &worker->qlock, - &ts); - if (waitres == ETIMEDOUT) - if (iot_unordered_exit (worker)) - retstat = -1; + STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, xattr, xdata); + return 0; +} - return retstat; +int +iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr, xdata); + return 0; } -call_stub_t * -iot_dequeue_unordered (iot_worker_t *worker) +int +iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - call_stub_t *stub= NULL; - iot_request_t *req = NULL; - int waitstat = 0; + call_stub_t *stub = NULL; + int ret = -1; - pthread_mutex_lock (&worker->qlock); - { - while (!worker->queue_size) { - waitstat = 0; - waitstat = iot_unordered_request_wait (worker); - /* If -1, request wait must've timed - * out. 
- */ - if (waitstat == -1) - goto out; + stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype, + xattr, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fxattrop, frame, -1, -ret, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); } + } + return 0; +} - list_for_each_entry (req, &worker->rqlist, list) - break; - list_del (&req->list); - stub = req->stub; - worker->queue_size--; +int32_t +iot_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, + uint8_t *strong_checksum, dict_t *xdata) +{ + STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, + strong_checksum, xdata); + return 0; +} + + +int32_t +iot_rchecksum_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, int32_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_rchecksum_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata); + return 0; +} + + +int32_t +iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int ret = -1; + + stub = fop_rchecksum_stub (frame, iot_rchecksum_wrapper, fd, offset, + len, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create rchecksum stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; } + + ret = iot_schedule (frame, this, stub); out: - pthread_mutex_unlock (&worker->qlock); + if (ret < 0) { + STACK_UNWIND_STRICT (rchecksum, frame, -1, -ret, -1, NULL, NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } - FREE (req); + return 0; +} - return stub; +int +iot_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; } -void * -iot_worker_unordered (void *arg) +int +iot_fallocate_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_fallocate_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; +} + + +int +iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - iot_worker_t *worker = arg; call_stub_t *stub = NULL; + int ret = -1; - while (1) { + stub = fop_fallocate_stub(frame, iot_fallocate_wrapper, fd, mode, offset, + len, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create fallocate stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } - stub = iot_dequeue_unordered (worker); - /* If no request was received, we must've timed out, - * and can exit. 
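/*
 * Sketch, not part of the patch: the ordered and unordered worker loops
 * being removed here collapse into a single iot_worker() that every
 * thread runs (it is started by gf_thread_create() in the
 * __iot_workers_scale() hunk that follows, but its body is not in these
 * hunks).  Roughly:
 */
static void *
iot_worker_sketch (void *data)
{
        iot_conf_t  *conf = data;
        call_stub_t *stub = NULL;
        int          i    = 0;

        for (;;) {
                stub = NULL;

                pthread_mutex_lock (&conf->mutex);
                {
                        /* The real loop waits with a timeout (note the
                         * second-granularity skew_sec_idle_time() helper in
                         * the new header) and lets a thread exit once it
                         * has idled for conf->idle_time seconds; the
                         * per-priority ac_iot_limit accounting and the
                         * least-priority throttle are also omitted here. */
                        while (conf->queue_size == 0) {
                                conf->sleep_count++;
                                pthread_cond_wait (&conf->cond, &conf->mutex);
                                conf->sleep_count--;
                        }

                        /* serve the highest-priority non-empty queue */
                        for (i = 0; i < IOT_PRI_MAX; i++) {
                                if (list_empty (&conf->reqs[i]))
                                        continue;
                                stub = list_entry (conf->reqs[i].next,
                                                   call_stub_t, list);
                                list_del_init (&stub->list);
                                conf->queue_sizes[i]--;
                                conf->queue_size--;
                                break;
                        }
                }
                pthread_mutex_unlock (&conf->mutex);

                if (stub != NULL)
                        call_resume (stub);
        }

        return NULL;
}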
*/ - if (stub == NULL) - break; + ret = iot_schedule (frame, this, stub); - call_resume (stub); - } - return NULL; +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fallocate, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; } - -iot_worker_t ** -allocate_worker_array (int count) +int +iot_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - iot_worker_t **warr = NULL; + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; +} - warr = CALLOC (count, sizeof (iot_worker_t *)); - ERR_ABORT (warr); - return warr; +int +iot_discard_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_discard_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); + return 0; } -iot_worker_t * -allocate_worker (iot_conf_t * conf) +int +iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - iot_worker_t *wrk = NULL; + call_stub_t *stub = NULL; + int ret = -1; + + stub = fop_discard_stub(frame, iot_discard_wrapper, fd, offset, len, + xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create discard stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } - wrk = CALLOC (1, sizeof (iot_worker_t)); - ERR_ABORT (wrk); + ret = iot_schedule (frame, this, stub); - INIT_LIST_HEAD (&wrk->rqlist); - wrk->conf = conf; - pthread_cond_init (&wrk->dq_cond, NULL); - pthread_mutex_init (&wrk->qlock, NULL); - wrk->state = IOT_STATE_DEAD; +out: + if (ret < 0) { + STACK_UNWIND_STRICT (discard, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} - return wrk; +int +iot_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; } +int +iot_zerofill_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_zerofill_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} -void -allocate_workers (iot_conf_t *conf, iot_worker_t **workers, int start_alloc_idx, - int count) +int +iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - int i; - int end_count; + call_stub_t *stub = NULL; + int ret = -1; - end_count = count + start_alloc_idx; - for (i = start_alloc_idx; i < end_count; i++) { - workers[i] = allocate_worker (conf); - workers[i]->thread_idx = i; + stub = fop_zerofill_stub(frame, iot_zerofill_wrapper, fd, + offset, len, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create zerofill stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (zerofill, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; } -void -iot_startup_worker (iot_worker_t *worker, iot_worker_fn workerfunc) +int +__iot_workers_scale (iot_conf_t *conf) { - worker->state = IOT_STATE_ACTIVE; - pthread_create (&worker->thread, &worker->conf->w_attr, workerfunc, - worker); + int scale = 0; + int diff = 0; + pthread_t 
thread; + int ret = 0; + int i = 0; + + for (i = 0; i < IOT_PRI_MAX; i++) + scale += min (conf->queue_sizes[i], conf->ac_iot_limit[i]); + + if (scale < IOT_MIN_THREADS) + scale = IOT_MIN_THREADS; + + if (scale > conf->max_count) + scale = conf->max_count; + + if (conf->curr_count < scale) { + diff = scale - conf->curr_count; + } + + while (diff) { + diff --; + + ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf); + if (ret == 0) { + conf->curr_count++; + gf_log (conf->this->name, GF_LOG_DEBUG, + "scaled threads to %d (queue_size=%d/%d)", + conf->curr_count, conf->queue_size, scale); + } else { + break; + } + } + + return diff; } -void -iot_startup_workers (iot_worker_t **workers, int start_idx, int count, - iot_worker_fn workerfunc) +int +iot_workers_scale (iot_conf_t *conf) { - int i = 0; - int end_idx = 0; + int ret = -1; + + if (conf == NULL) { + ret = -EINVAL; + goto out; + } - end_idx = start_idx + count; - for (i = start_idx; i < end_idx; i++) - iot_startup_worker (workers[i], workerfunc); + pthread_mutex_lock (&conf->mutex); + { + ret = __iot_workers_scale (conf); + } + pthread_mutex_unlock (&conf->mutex); +out: + return ret; } @@ -2134,49 +2625,133 @@ set_stack_size (iot_conf_t *conf) { int err = 0; size_t stacksize = IOT_THREAD_STACK_SIZE; + xlator_t *this = NULL; + + this = THIS; pthread_attr_init (&conf->w_attr); err = pthread_attr_setstacksize (&conf->w_attr, stacksize); if (err == EINVAL) { - gf_log (conf->this->name, GF_LOG_WARNING, + err = pthread_attr_getstacksize (&conf->w_attr, &stacksize); + if (!err) + gf_log (this->name, GF_LOG_WARNING, + "Using default thread stack size %zd", + stacksize); + else + gf_log (this->name, GF_LOG_WARNING, "Using default thread stack size"); } + + conf->stack_size = stacksize; } -void -workers_init (iot_conf_t *conf) +int32_t +mem_acct_init (xlator_t *this) { - /* Initialize un-ordered workers */ - conf->uworkers = allocate_worker_array (conf->max_u_threads); - allocate_workers (conf, conf->uworkers, 0, conf->max_u_threads); + int ret = -1; - /* Initialize ordered workers */ - conf->oworkers = allocate_worker_array (conf->max_o_threads); - allocate_workers (conf, conf->oworkers, 0, conf->max_o_threads); + if (!this) + return ret; - set_stack_size (conf); - iot_startup_workers (conf->oworkers, 0, conf->min_o_threads, - iot_worker_ordered); - iot_startup_workers (conf->uworkers, 0, conf->min_u_threads, - iot_worker_unordered); + ret = xlator_mem_acct_init (this, gf_iot_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + +int +iot_priv_dump (xlator_t *this) +{ + iot_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + if (!this) + return 0; + + conf = this->private; + if (!conf) + return 0; + + snprintf (key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, + this->name); + + gf_proc_dump_add_section(key_prefix); + + gf_proc_dump_write("maximum_threads_count", "%d", conf->max_count); + gf_proc_dump_write("current_threads_count", "%d", conf->curr_count); + gf_proc_dump_write("sleep_count", "%d", conf->sleep_count); + gf_proc_dump_write("idle_time", "%d", conf->idle_time); + gf_proc_dump_write("stack_size", "%zd", conf->stack_size); + gf_proc_dump_write("high_priority_threads", "%d", + conf->ac_iot_limit[IOT_PRI_HI]); + gf_proc_dump_write("normal_priority_threads", "%d", + conf->ac_iot_limit[IOT_PRI_NORMAL]); + gf_proc_dump_write("low_priority_threads", "%d", + conf->ac_iot_limit[IOT_PRI_LO]); + 
gf_proc_dump_write("least_priority_threads", "%d", + conf->ac_iot_limit[IOT_PRI_LEAST]); + + gf_proc_dump_write("cached least rate", "%u", + conf->throttle.cached_rate); + gf_proc_dump_write("least rate limit", "%u", conf->throttle.rate_limit); + + return 0; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + iot_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + if (!conf) + goto out; + + GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); + + GF_OPTION_RECONF ("high-prio-threads", + conf->ac_iot_limit[IOT_PRI_HI], options, int32, out); + + GF_OPTION_RECONF ("normal-prio-threads", + conf->ac_iot_limit[IOT_PRI_NORMAL], options, int32, + out); + + GF_OPTION_RECONF ("low-prio-threads", + conf->ac_iot_limit[IOT_PRI_LO], options, int32, out); + + GF_OPTION_RECONF ("least-prio-threads", + conf->ac_iot_limit[IOT_PRI_LEAST], options, int32, + out); + GF_OPTION_RECONF ("enable-least-priority", conf->least_priority, + options, bool, out); + + GF_OPTION_RECONF("least-rate-limit", conf->throttle.rate_limit, options, + int32, out); + + ret = 0; +out: + return ret; } int init (xlator_t *this) { - iot_conf_t *conf = NULL; - dict_t *options = this->options; - int thread_count = IOT_DEFAULT_THREADS; - gf_boolean_t autoscaling = IOT_SCALING_OFF; - char *scalestr = NULL; - int min_threads, max_threads; + iot_conf_t *conf = NULL; + int ret = -1; + int i = 0; if (!this->children || this->children->next) { gf_log ("io-threads", GF_LOG_ERROR, "FATAL: iot not configured with exactly one child"); - return -1; + goto out; } if (!this->parents) { @@ -2184,106 +2759,75 @@ init (xlator_t *this) "dangling volume. check volfile "); } - conf = (void *) CALLOC (1, sizeof (*conf)); - ERR_ABORT (conf); + conf = (void *) GF_CALLOC (1, sizeof (*conf), + gf_iot_mt_iot_conf_t); + if (conf == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory"); + goto out; + } - if ((dict_get_str (options, "autoscaling", &scalestr)) == 0) { - if ((gf_string2boolean (scalestr, &autoscaling)) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'autoscaling' option must be" - " boolean"); - return -1; - } + if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "pthread_cond_init failed (%d)", ret); + goto out; } - if (dict_get (options, "thread-count")) { - thread_count = data_to_int32 (dict_get (options, - "thread-count")); - if (scalestr != NULL) - gf_log (this->name, GF_LOG_WARNING, - "'thread-count' is specified with " - "'autoscaling' on. Ignoring" - "'thread-count' option."); - if (thread_count < 2) - thread_count = IOT_MIN_THREADS; - } - - min_threads = IOT_DEFAULT_THREADS; - max_threads = IOT_MAX_THREADS; - if (dict_get (options, "min-threads")) - min_threads = data_to_int32 (dict_get (options, - "min-threads")); - - if (dict_get (options, "max-threads")) - max_threads = data_to_int32 (dict_get (options, - "max-threads")); - - if (min_threads > max_threads) { - gf_log (this->name, GF_LOG_ERROR, " min-threads must be less " - "than max-threads"); - return -1; - } - - /* If autoscaling is off, then adjust the min and max - * threads according to thread-count. - * This is based on the assumption that despite autoscaling - * being off, we still want to have separate pools for data - * and meta-data threads. - */ - if (!autoscaling) - max_threads = min_threads = thread_count; - - /* If user specifies an odd number of threads, increase it by - * one. The reason for having an even number of threads is - * explained later. 
- */ - if (max_threads % 2) - max_threads++; - - if(min_threads % 2) - min_threads++; - - /* If the user wants to have only a single thread for - * some strange reason, make sure we set this count to - * 2. Explained later. - */ - if (min_threads < IOT_MIN_THREADS) - min_threads = IOT_MIN_THREADS; - - /* Again, have atleast two. Read on. */ - if (max_threads < IOT_MIN_THREADS) - max_threads = IOT_MIN_THREADS; - - /* This is why we need atleast two threads. - * We're dividing the specified thread pool into - * 2 halves, equally between ordered and unordered - * pools. - */ - - /* Init params for un-ordered workers. */ - pthread_mutex_init (&conf->utlock, NULL); - conf->max_u_threads = max_threads / 2; - conf->min_u_threads = min_threads / 2; - conf->u_idle_time = IOT_DEFAULT_IDLE; - conf->u_scaling = autoscaling; - - /* Init params for ordered workers. */ - pthread_mutex_init (&conf->otlock, NULL); - conf->max_o_threads = max_threads / 2; - conf->min_o_threads = min_threads / 2; - conf->o_idle_time = IOT_DEFAULT_IDLE; - conf->o_scaling = autoscaling; - - gf_log (this->name, GF_LOG_DEBUG, - "io-threads: Autoscaling: %s, " - "min_threads: %d, max_threads: %d", - (autoscaling) ? "on":"off", min_threads, max_threads); + if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "pthread_mutex_init failed (%d)", ret); + goto out; + } + + set_stack_size (conf); + + GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); + + GF_OPTION_INIT ("high-prio-threads", + conf->ac_iot_limit[IOT_PRI_HI], int32, out); + + GF_OPTION_INIT ("normal-prio-threads", + conf->ac_iot_limit[IOT_PRI_NORMAL], int32, out); + + GF_OPTION_INIT ("low-prio-threads", + conf->ac_iot_limit[IOT_PRI_LO], int32, out); + + GF_OPTION_INIT ("least-prio-threads", + conf->ac_iot_limit[IOT_PRI_LEAST], int32, out); + + GF_OPTION_INIT ("idle-time", conf->idle_time, int32, out); + GF_OPTION_INIT ("enable-least-priority", conf->least_priority, + bool, out); + + GF_OPTION_INIT("least-rate-limit", conf->throttle.rate_limit, int32, + out); + if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "pthread_mutex_init failed (%d)", ret); + goto out; + } conf->this = this; - workers_init (conf); + + for (i = 0; i < IOT_PRI_MAX; i++) { + INIT_LIST_HEAD (&conf->reqs[i]); + } + + ret = iot_workers_scale (conf); + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "cannot initialize worker threads, exiting init"); + goto out; + } this->private = conf; - return 0; + ret = 0; +out: + if (ret) + GF_FREE (conf); + + return ret; } @@ -2292,89 +2836,129 @@ fini (xlator_t *this) { iot_conf_t *conf = this->private; - FREE (conf); + GF_FREE (conf); this->private = NULL; return; } -/* - * O - Goes to ordered threadpool. - * U - Goes to un-ordered threadpool. - * V - Variable, depends on whether the file is open. - * If it is, then goes to ordered, otherwise to - * un-ordered. 
- */ -struct xlator_fops fops = { - .open = iot_open, /* U */ - .create = iot_create, /* U */ - .readv = iot_readv, /* O */ - .writev = iot_writev, /* O */ - .flush = iot_flush, /* O */ - .fsync = iot_fsync, /* O */ - .lk = iot_lk, /* O */ - .stat = iot_stat, /* V */ - .fstat = iot_fstat, /* O */ - .truncate = iot_truncate, /* V */ - .ftruncate = iot_ftruncate, /* O */ - .utimens = iot_utimens, /* V */ - .checksum = iot_checksum, /* U */ - .unlink = iot_unlink, /* U */ - .lookup = iot_lookup, /* U */ - .chmod = iot_chmod, /* V */ - .fchmod = iot_fchmod, /* O */ - .chown = iot_chown, /* V */ - .fchown = iot_fchown, /* O */ - .access = iot_access, /* U */ - .readlink = iot_readlink, /* U */ - .mknod = iot_mknod, /* U */ - .mkdir = iot_mkdir, /* U */ - .rmdir = iot_rmdir, /* U */ - .symlink = iot_symlink, /* U */ - .rename = iot_rename, /* U */ - .link = iot_link, /* U */ - .opendir = iot_opendir, /* U */ - .fsyncdir = iot_fsyncdir, /* O */ - .statfs = iot_statfs, /* U */ - .setxattr = iot_setxattr, /* U */ - .getxattr = iot_getxattr, /* U */ - .fgetxattr = iot_fgetxattr, /* O */ - .fsetxattr = iot_fsetxattr, /* O */ - .removexattr = iot_removexattr, /* U */ - .readdir = iot_readdir, /* O */ - .xattrop = iot_xattrop, /* U */ +struct xlator_dumpops dumpops = { + .priv = iot_priv_dump, }; -struct xlator_mops mops = { +struct xlator_fops fops = { + .open = iot_open, + .create = iot_create, + .readv = iot_readv, + .writev = iot_writev, + .flush = iot_flush, + .fsync = iot_fsync, + .lk = iot_lk, + .stat = iot_stat, + .fstat = iot_fstat, + .truncate = iot_truncate, + .ftruncate = iot_ftruncate, + .unlink = iot_unlink, + .lookup = iot_lookup, + .setattr = iot_setattr, + .fsetattr = iot_fsetattr, + .access = iot_access, + .readlink = iot_readlink, + .mknod = iot_mknod, + .mkdir = iot_mkdir, + .rmdir = iot_rmdir, + .symlink = iot_symlink, + .rename = iot_rename, + .link = iot_link, + .opendir = iot_opendir, + .fsyncdir = iot_fsyncdir, + .statfs = iot_statfs, + .setxattr = iot_setxattr, + .getxattr = iot_getxattr, + .fgetxattr = iot_fgetxattr, + .fsetxattr = iot_fsetxattr, + .removexattr = iot_removexattr, + .fremovexattr = iot_fremovexattr, + .readdir = iot_readdir, + .readdirp = iot_readdirp, + .inodelk = iot_inodelk, + .finodelk = iot_finodelk, + .entrylk = iot_entrylk, + .fentrylk = iot_fentrylk, + .xattrop = iot_xattrop, + .fxattrop = iot_fxattrop, + .rchecksum = iot_rchecksum, + .fallocate = iot_fallocate, + .discard = iot_discard, + .zerofill = iot_zerofill, }; -struct xlator_cbks cbks = { -}; +struct xlator_cbks cbks; struct volume_options options[] = { - { .key = {"thread-count"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS + { .key = {"thread-count"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .description = "Number of threads in IO threads translator which " + "perform concurrent IO operations" + + }, + { .key = {"high-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .description = "Max number of threads in IO threads translator which " + "perform high priority IO operations at a given time" + + }, + { .key = {"normal-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .description = "Max number of threads in IO threads translator which " + "perform normal priority IO operations at a given time" + + }, + { .key = {"low-prio-threads"}, + .type = 
GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .description = "Max number of threads in IO threads translator which " + "perform low priority IO operations at a given time" + }, - { .key = {"autoscaling"}, - .type = GF_OPTION_TYPE_BOOL + { .key = {"least-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "1", + .description = "Max number of threads in IO threads translator which " + "perform least priority IO operations at a given time" + }, + { .key = {"enable-least-priority"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Enable/Disable least priority" }, - { .key = {"min-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .description = "Minimum number of threads must be greater than or " - "equal to 2. If the specified value is less than 2 " - "it is adjusted upwards to 2. This is a requirement" - " for the current model of threading in io-threads." + {.key = {"idle-time"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 0x7fffffff, + .default_value = "120", }, - { .key = {"max-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .description = "Maximum number of threads is advisory only so the " - "user specified value will be used." + {.key = {"least-rate-limit"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .description = "Max number of least priority operations to handle " + "per-second" + }, + { .key = {NULL}, }, - { .key = {NULL} }, }; diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index f02109428..1a9dee9ae 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __IOT_H @@ -34,123 +25,68 @@ #include "common-utils.h" #include "list.h" #include <stdlib.h> +#include "locking.h" +#include "iot-mem-types.h" +#include <semaphore.h> +#include "statedump.h" -#define min(a,b) ((a)<(b)?(a):(b)) -#define max(a,b) ((a)>(b)?(a):(b)) struct iot_conf; -struct iot_worker; -struct iot_request; - -struct iot_request { - struct list_head list; /* Attaches this request to the list of - requests. 
- */ - call_stub_t *stub; -}; -#define IOT_STATE_ACTIVE 1 -#define IOT_STATE_DEAD 2 -#define iot_worker_active(wrk) ((wrk)->state == IOT_STATE_ACTIVE) +#define MAX_IDLE_SKEW 4 /* In secs */ +#define skew_sec_idle_time(sec) ((sec) + (random () % MAX_IDLE_SKEW)) +#define IOT_DEFAULT_IDLE 120 /* In secs. */ -#define MAX_IDLE_SKEW 1000 /* usecs */ -#define skew_usec_idle_time(usec) ((usec) + (random () % MAX_IDLE_SKEW)) -#define IOT_DEFAULT_IDLE 180 /* In secs. */ - -#define IOT_MIN_THREADS 2 +#define IOT_MIN_THREADS 1 #define IOT_DEFAULT_THREADS 16 -#define IOT_MAX_THREADS 256 +#define IOT_MAX_THREADS 64 -#define IOT_SCALING_OFF _gf_false -#define IOT_SCALING_ON _gf_true -#define iot_ordered_scaling_on(conf) ((conf)->o_scaling == IOT_SCALING_ON) -#define iot_unordered_scaling_on(conf) ((conf)->u_scaling == IOT_SCALING_ON) #define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) -struct iot_worker { - struct list_head rqlist; /* List of requests assigned to me. */ - struct iot_conf *conf; - int64_t q,dq; - pthread_cond_t dq_cond; - pthread_mutex_t qlock; - int32_t queue_size; - pthread_t thread; - int state; /* What state is the thread in. */ - int thread_idx; /* Thread's index into the worker - array. Since this will be thread - local data, for ensuring that - number of threads dont fall below - a minimum, we just dont allow - threads with specific indices to - exit. Helps us in eliminating one - place where otherwise a lock - would have been required to update - centralized state inside conf. - */ + +typedef enum { + IOT_PRI_HI = 0, /* low latency */ + IOT_PRI_NORMAL, /* normal */ + IOT_PRI_LO, /* bulk */ + IOT_PRI_LEAST, /* least */ + IOT_PRI_MAX, +} iot_pri_t; + +#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */ +struct iot_least_throttle { + struct timeval sample_time; /* timestamp of current sample */ + uint32_t sample_cnt; /* sample count for active interval */ + uint32_t cached_rate; /* the most recently measured rate */ + int32_t rate_limit; /* user-specified rate limit */ + pthread_mutex_t lock; }; struct iot_conf { - int32_t thread_count; - struct iot_worker **workers; + pthread_mutex_t mutex; + pthread_cond_t cond; + + int32_t max_count; /* configured maximum */ + int32_t curr_count; /* actual number of threads running */ + int32_t sleep_count; + + int32_t idle_time; /* in seconds */ + + struct list_head reqs[IOT_PRI_MAX]; + + int32_t ac_iot_limit[IOT_PRI_MAX]; + int32_t ac_iot_count[IOT_PRI_MAX]; + int queue_sizes[IOT_PRI_MAX]; + int queue_size; + pthread_attr_t w_attr; + gf_boolean_t least_priority; /*Enable/Disable least-priority */ xlator_t *this; - /* Config state for ordered threads. */ - pthread_mutex_t otlock; /* Used to sync any state that needs - to be changed by the ordered - threads. - */ - - int max_o_threads; /* Max. number of ordered threads */ - int min_o_threads; /* Min. number of ordered threads. - Ordered thread count never falls - below this threshold. - */ - - int o_idle_time; /* in Secs. The idle time after - which an ordered thread exits. - */ - gf_boolean_t o_scaling; /* Set to IOT_SCALING_OFF if user - does not want thread scaling on - ordered threads. If scaling is - off, io-threads maintains at - least min_o_threads number of - threads and never lets any thread - exit. - */ - struct iot_worker **oworkers; /* Ordered thread pool. */ - - - /* Config state for unordered threads */ - pthread_mutex_t utlock; /* Used for scaling un-ordered - threads. */ - struct iot_worker **uworkers; /* Un-ordered thread pool. 
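
The per-priority queues and limits introduced above (reqs[IOT_PRI_MAX], ac_iot_limit[], ac_iot_count[], idle_time) replace the old per-worker ordered/unordered pools: a worker services the most urgent non-empty queue whose class still has head-room, and exits after idle_time seconds without work. The standalone sketch below models that loop with plain pthreads; the pool/request types, pick_request() and the omitted least-priority rate limiting are simplifications, not the xlator's actual code.

#include <pthread.h>
#include <stdlib.h>
#include <time.h>

#define PRI_MAX 4                           /* hi, normal, lo, least */

struct request { struct request *next; int pri; };

struct pool {
        pthread_mutex_t  mutex;
        pthread_cond_t   cond;
        struct request  *queue[PRI_MAX];    /* one FIFO per priority    */
        int              ac_limit[PRI_MAX]; /* max active per priority  */
        int              ac_count[PRI_MAX]; /* currently being serviced */
        int              idle_time;         /* secs before a worker exits */
        int              curr_count;        /* workers alive            */
};

/* Most urgent request whose priority class still has head-room. */
static struct request *
pick_request (struct pool *p)
{
        int i;

        for (i = 0; i < PRI_MAX; i++) {
                if (p->queue[i] && p->ac_count[i] < p->ac_limit[i]) {
                        struct request *r = p->queue[i];

                        p->queue[i] = r->next;
                        p->ac_count[i]++;
                        return r;
                }
        }
        return NULL;
}

static void *
worker (void *arg)
{
        struct pool     *p = arg;
        struct request  *r = NULL;
        struct timespec  ts;

        pthread_mutex_lock (&p->mutex);
        for (;;) {
                while ((r = pick_request (p)) == NULL) {
                        clock_gettime (CLOCK_REALTIME, &ts);
                        ts.tv_sec += p->idle_time;
                        if (pthread_cond_timedwait (&p->cond, &p->mutex,
                                                    &ts) != 0)
                                goto out;    /* idle for idle_time: exit */
                }
                pthread_mutex_unlock (&p->mutex);

                /* ... service the request (call_resume() in the xlator) ... */

                pthread_mutex_lock (&p->mutex);
                p->ac_count[r->pri]--;
                free (r);
        }
out:
        p->curr_count--;
        pthread_mutex_unlock (&p->mutex);
        return NULL;
}
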
*/ - int max_u_threads; /* Number of unordered threads will - not be higher than this. */ - int min_u_threads; /* Number of unordered threads - should not fall below this value. - */ - int u_idle_time; /* If an unordered thread does not - get a request for this amount of - secs, it should try to die. - */ - gf_boolean_t u_scaling; /* Set to IOT_SCALING_OFF if user - does not want thread scaling on - unordered threads. If scaling is - off, io-threads maintains at - least min_u_threads number of - threads and never lets any thread - exit. - */ - - pthread_attr_t w_attr; /* Used to reduce the stack size of - the pthread worker down from the - default of 8MiB. - */ + size_t stack_size; + + struct iot_least_throttle throttle; }; typedef struct iot_conf iot_conf_t; -typedef struct iot_worker iot_worker_t; -typedef struct iot_request iot_request_t; #endif /* __IOT_H */ diff --git a/xlators/performance/io-threads/src/iot-mem-types.h b/xlators/performance/io-threads/src/iot-mem-types.h new file mode 100644 index 000000000..4fa8302d1 --- /dev/null +++ b/xlators/performance/io-threads/src/iot-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __IOT_MEM_TYPES_H__ +#define __IOT_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_iot_mem_types_ { + gf_iot_mt_iot_conf_t = gf_common_mt_end + 1, + gf_iot_mt_end +}; +#endif + diff --git a/xlators/performance/stat-prefetch/Makefile.am b/xlators/performance/md-cache/Makefile.am index af437a64d..af437a64d 100644 --- a/xlators/performance/stat-prefetch/Makefile.am +++ b/xlators/performance/md-cache/Makefile.am diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am new file mode 100644 index 000000000..8c9f5a858 --- /dev/null +++ b/xlators/performance/md-cache/src/Makefile.am @@ -0,0 +1,25 @@ +xlator_LTLIBRARIES = md-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +md_cache_la_LDFLAGS = -module -avoid-version + +md_cache_la_SOURCES = md-cache.c +md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = md-cache-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(CONTRIBDIR)/rbtree + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + + +stat-prefetch-compat: + mkdir -p $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + rm -rf $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so + ln -s ./md-cache.so $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so + + +install-exec-local: stat-prefetch-compat diff --git a/xlators/performance/md-cache/src/md-cache-mem-types.h b/xlators/performance/md-cache/src/md-cache-mem-types.h new file mode 100644 index 000000000..6634cf962 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __MDC_MEM_TYPES_H__ +#define __MDC_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_mdc_mem_types_ { + gf_mdc_mt_mdc_local_t = gf_common_mt_end + 1, + gf_mdc_mt_md_cache_t, + gf_mdc_mt_mdc_conf_t, + gf_mdc_mt_end +}; +#endif + diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c new file mode 100644 index 000000000..84c363ad9 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache.c @@ -0,0 +1,2303 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "md-cache-mem-types.h" +#include "glusterfs-acl.h" +#include <assert.h> +#include <sys/time.h> + + +/* TODO: + - cache symlink() link names and nuke symlink-cache + - send proper postbuf in setattr_cbk even when op_ret = -1 +*/ + + +struct mdc_conf { + int timeout; + gf_boolean_t cache_posix_acl; + gf_boolean_t cache_selinux; + gf_boolean_t force_readdirp; +}; + + +static struct mdc_key { + const char *name; + int load; + int check; +} mdc_keys[] = { + { + .name = POSIX_ACL_ACCESS_XATTR, + .load = 0, + .check = 1, + }, + { + .name = POSIX_ACL_DEFAULT_XATTR, + .load = 0, + .check = 1, + }, + { + .name = GF_SELINUX_XATTR_KEY, + .load = 0, + .check = 1, + }, + { + .name = "security.capability", + .load = 0, + .check = 1, + }, + { + .name = "gfid-req", + .load = 0, + .check = 1, + }, + { + .name = NULL, + .load = 0, + .check = 0, + } +}; + + +static uint64_t +gfid_to_ino (uuid_t gfid) +{ + uint64_t ino = 0; + int i = 0, j = 0; + + for (i = 15; i > (15 - 8); i--) { + ino += (uint64_t)(gfid[i]) << j; + j += 8; + } + + return ino; +} + + +struct mdc_local; +typedef struct mdc_local mdc_local_t; + +#define MDC_STACK_UNWIND(fop, frame, params ...) 
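
gfid_to_ino() above folds the last eight bytes of the 128-bit gfid into the inode number reported on cached stats (used later in mdc_inode_iatt_get()). A worked example; sample_gfid_t and fold_gfid() are stand-ins for uuid_t and the function above, kept byte-for-byte identical in the loop:

#include <stdio.h>
#include <stdint.h>

typedef unsigned char sample_gfid_t[16];    /* stand-in for uuid_t */

/* Same folding as gfid_to_ino(): gfid[15] becomes the least significant
 * byte of the inode number, gfid[8] the most significant. */
static uint64_t
fold_gfid (const sample_gfid_t gfid)
{
        uint64_t ino = 0;
        int      i, j = 0;

        for (i = 15; i > (15 - 8); i--) {
                ino += (uint64_t)gfid[i] << j;
                j += 8;
        }
        return ino;
}

int
main (void)
{
        sample_gfid_t gfid = { 0, 0, 0, 0, 0, 0, 0, 0,
                               0x01, 0x02, 0x03, 0x04,
                               0x05, 0x06, 0x07, 0x08 };

        /* prints 0102030405060708 */
        printf ("%016llx\n", (unsigned long long) fold_gfid (gfid));
        return 0;
}
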
do { \ + mdc_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + mdc_local_wipe (__xl, __local); \ + } while (0) + + +struct md_cache { + ia_prot_t md_prot; + uint32_t md_nlink; + uint32_t md_uid; + uint32_t md_gid; + uint32_t md_atime; + uint32_t md_atime_nsec; + uint32_t md_mtime; + uint32_t md_mtime_nsec; + uint32_t md_ctime; + uint32_t md_ctime_nsec; + uint64_t md_rdev; + uint64_t md_size; + uint64_t md_blocks; + dict_t *xattr; + char *linkname; + time_t ia_time; + time_t xa_time; + gf_lock_t lock; +}; + + +struct mdc_local { + loc_t loc; + loc_t loc2; + fd_t *fd; + char *linkname; + char *key; + dict_t *xattr; +}; + + +int +__mdc_inode_ctx_get (xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +{ + int ret = 0; + struct md_cache *mdc = NULL; + uint64_t mdc_int = 0; + + ret = __inode_ctx_get (inode, this, &mdc_int); + mdc = (void *) (long) (mdc_int); + if (ret == 0 && mdc_p) + *mdc_p = mdc; + + return ret; +} + + +int +mdc_inode_ctx_get (xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +{ + int ret; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_get (this, inode, mdc_p); + } + UNLOCK(&inode->lock); + + return ret; +} + + +int +__mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc) +{ + int ret = 0; + uint64_t mdc_int = 0; + + mdc_int = (long) mdc; + ret = __inode_ctx_set (inode, this, &mdc_int); + + return ret; +} + + +int +mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc) +{ + int ret; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_set (this, inode, mdc); + } + UNLOCK(&inode->lock); + + return ret; +} + + +mdc_local_t * +mdc_local_get (call_frame_t *frame) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (local) + goto out; + + local = GF_CALLOC (sizeof (*local), 1, gf_mdc_mt_mdc_local_t); + if (!local) + goto out; + + frame->local = local; +out: + return local; +} + + +void +mdc_local_wipe (xlator_t *this, mdc_local_t *local) +{ + if (!local) + return; + + loc_wipe (&local->loc); + + loc_wipe (&local->loc2); + + if (local->fd) + fd_unref (local->fd); + + GF_FREE (local->linkname); + + GF_FREE (local->key); + + if (local->xattr) + dict_unref (local->xattr); + + GF_FREE (local); + return; +} + + +int +mdc_inode_wipe (xlator_t *this, inode_t *inode) +{ + int ret = 0; + uint64_t mdc_int = 0; + struct md_cache *mdc = NULL; + + ret = inode_ctx_del (inode, this, &mdc_int); + if (ret != 0) + goto out; + + mdc = (void *) (long) mdc_int; + + if (mdc->xattr) + dict_unref (mdc->xattr); + + GF_FREE (mdc->linkname); + + GF_FREE (mdc); + + ret = 0; +out: + return ret; +} + + +struct md_cache * +mdc_inode_prep (xlator_t *this, inode_t *inode) +{ + int ret = 0; + struct md_cache *mdc = NULL; + + LOCK (&inode->lock); + { + ret = __mdc_inode_ctx_get (this, inode, &mdc); + if (ret == 0) + goto unlock; + + mdc = GF_CALLOC (sizeof (*mdc), 1, gf_mdc_mt_md_cache_t); + if (!mdc) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unlock; + } + + LOCK_INIT (&mdc->lock); + + ret = __mdc_inode_ctx_set (this, inode, mdc); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + GF_FREE (mdc); + mdc = NULL; + } + } +unlock: + UNLOCK (&inode->lock); + + return mdc; +} + + +static gf_boolean_t +is_md_cache_iatt_valid (xlator_t *this, struct md_cache *mdc) +{ + struct mdc_conf *conf = NULL; + time_t now = 0; + gf_boolean_t ret = _gf_true; + conf = 
this->private; + + time (&now); + + LOCK (&mdc->lock); + { + if (now >= (mdc->ia_time + conf->timeout)) + ret = _gf_false; + } + UNLOCK (&mdc->lock); + + return ret; +} + + +static gf_boolean_t +is_md_cache_xatt_valid (xlator_t *this, struct md_cache *mdc) +{ + struct mdc_conf *conf = NULL; + time_t now = 0; + gf_boolean_t ret = _gf_true; + + conf = this->private; + + time (&now); + + LOCK (&mdc->lock); + { + if (now >= (mdc->xa_time + conf->timeout)) + ret = _gf_false; + } + UNLOCK (&mdc->lock); + + return ret; +} + + +void +mdc_from_iatt (struct md_cache *mdc, struct iatt *iatt) +{ + mdc->md_prot = iatt->ia_prot; + mdc->md_nlink = iatt->ia_nlink; + mdc->md_uid = iatt->ia_uid; + mdc->md_gid = iatt->ia_gid; + mdc->md_atime = iatt->ia_atime; + mdc->md_atime_nsec = iatt->ia_atime_nsec; + mdc->md_mtime = iatt->ia_mtime; + mdc->md_mtime_nsec = iatt->ia_mtime_nsec; + mdc->md_ctime = iatt->ia_ctime; + mdc->md_ctime_nsec = iatt->ia_ctime_nsec; + mdc->md_rdev = iatt->ia_rdev; + mdc->md_size = iatt->ia_size; + mdc->md_blocks = iatt->ia_blocks; +} + + +void +mdc_to_iatt (struct md_cache *mdc, struct iatt *iatt) +{ + iatt->ia_prot = mdc->md_prot; + iatt->ia_nlink = mdc->md_nlink; + iatt->ia_uid = mdc->md_uid; + iatt->ia_gid = mdc->md_gid; + iatt->ia_atime = mdc->md_atime; + iatt->ia_atime_nsec = mdc->md_atime_nsec; + iatt->ia_mtime = mdc->md_mtime; + iatt->ia_mtime_nsec = mdc->md_mtime_nsec; + iatt->ia_ctime = mdc->md_ctime; + iatt->ia_ctime_nsec = mdc->md_ctime_nsec; + iatt->ia_rdev = mdc->md_rdev; + iatt->ia_size = mdc->md_size; + iatt->ia_blocks = mdc->md_blocks; +} + + +int +mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf, + struct iatt *iatt) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep (this, inode); + if (!mdc) + goto out; + + LOCK (&mdc->lock); + { + if (!iatt || !iatt->ia_ctime) { + mdc->ia_time = 0; + goto unlock; + } + + /* + * Invalidate the inode if the mtime or ctime has changed + * and the prebuf doesn't match the value we have cached. + * TODO: writev returns with a NULL iatt due to + * performance/write-behind, causing invalidation on writes. 
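
Two checks govern the cached attributes: the freshness window above (an entry is served only while it is younger than md-cache-timeout, 1 second by default), and the decision just below in mdc_inode_iatt_set_validate() on whether to ask the kernel to invalidate its caches - done only for regular files whose post-op times differ from the cached ones and whose pre-op times do not match the cache either, i.e. somebody else changed the file in between. A condensed, lock-free restatement; the struct and function names are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct cached_times {
        time_t   stamp;      /* ia_time/xa_time: when the copy was taken */
        uint32_t mtime;
        uint32_t ctime;
};

/* is_md_cache_iatt_valid()/is_md_cache_xatt_valid(), condensed. */
static bool
entry_is_fresh (const struct cached_times *c, int timeout)
{
        return time (NULL) < c->stamp + timeout;
}

/* The inode_invalidate() condition in mdc_inode_iatt_set_validate(),
 * condensed: invalidate only when the file changed and this very call
 * cannot account for the change. */
static bool
needs_kernel_invalidate (const struct cached_times *cached,
                         const struct cached_times *pre,   /* may be NULL */
                         const struct cached_times *post)
{
        bool changed = (post->mtime != cached->mtime ||
                        post->ctime != cached->ctime);

        if (!changed)
                return false;

        return (!pre ||
                pre->mtime != cached->mtime ||
                pre->ctime != cached->ctime);
}
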
+ */ + if (IA_ISREG(inode->ia_type) && + ((iatt->ia_mtime != mdc->md_mtime) || + (iatt->ia_ctime != mdc->md_ctime))) + if (!prebuf || (prebuf->ia_ctime != mdc->md_ctime) || + (prebuf->ia_mtime != mdc->md_mtime)) + inode_invalidate(inode); + + mdc_from_iatt (mdc, iatt); + + time (&mdc->ia_time); + } +unlock: + UNLOCK (&mdc->lock); + ret = 0; +out: + return ret; +} + +int mdc_inode_iatt_set(xlator_t *this, inode_t *inode, struct iatt *iatt) +{ + return mdc_inode_iatt_set_validate(this, inode, NULL, iatt); +} + +int +mdc_inode_iatt_get (xlator_t *this, inode_t *inode, struct iatt *iatt) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + if (!is_md_cache_iatt_valid (this, mdc)) + goto out; + + LOCK (&mdc->lock); + { + mdc_to_iatt (mdc, iatt); + } + UNLOCK (&mdc->lock); + + uuid_copy (iatt->ia_gfid, inode->gfid); + iatt->ia_ino = gfid_to_ino (inode->gfid); + iatt->ia_dev = 42; + iatt->ia_type = inode->ia_type; + + ret = 0; +out: + return ret; +} + +struct updatedict { + dict_t *dict; + int ret; +}; + +static int +updatefn(dict_t *dict, char *key, data_t *value, void *data) +{ + struct updatedict *u = data; + const char *mdc_key; + int i = 0; + + for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { + if (!mdc_keys[i].check) + continue; + if (strcmp(mdc_key, key)) + continue; + + if (!u->dict) { + u->dict = dict_new(); + if (!u->dict) { + u->ret = -1; + return -1; + } + } + + if (dict_set(u->dict, key, value) < 0) { + u->ret = -1; + return -1; + } + + break; + } + return 0; +} + +static int +mdc_dict_update(dict_t **tgt, dict_t *src) +{ + struct updatedict u = { + .dict = *tgt, + .ret = 0, + }; + + dict_foreach(src, updatefn, &u); + + if (*tgt) + return u.ret; + + if ((u.ret < 0) && u.dict) { + dict_unref(u.dict); + return u.ret; + } + + *tgt = u.dict; + + return u.ret; +} + +int +mdc_inode_xatt_set (xlator_t *this, inode_t *inode, dict_t *dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + dict_t *newdict = NULL; + + mdc = mdc_inode_prep (this, inode); + if (!mdc) + goto out; + + if (!dict) + goto out; + + LOCK (&mdc->lock); + { + if (mdc->xattr) { + dict_unref (mdc->xattr); + mdc->xattr = NULL; + } + + ret = mdc_dict_update(&newdict, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; + } + + if (newdict) + mdc->xattr = newdict; + + time (&mdc->xa_time); + } + UNLOCK (&mdc->lock); + ret = 0; +out: + return ret; +} + + +int +mdc_inode_xatt_update (xlator_t *this, inode_t *inode, dict_t *dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep (this, inode); + if (!mdc) + goto out; + + if (!dict) + goto out; + + LOCK (&mdc->lock); + { + ret = mdc_dict_update(&mdc->xattr, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; + } + + time (&mdc->xa_time); + } + UNLOCK (&mdc->lock); + + ret = 0; +out: + return ret; +} + + +int +mdc_inode_xatt_unset (xlator_t *this, inode_t *inode, char *name) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep (this, inode); + if (!mdc) + goto out; + + if (!name) + goto out; + + LOCK (&mdc->lock); + { + dict_del (mdc->xattr, name); + } + UNLOCK (&mdc->lock); + + ret = 0; +out: + return ret; +} + + +int +mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + if (!is_md_cache_xatt_valid (this, mdc)) + goto out; + + LOCK (&mdc->lock); + { + ret = 0; + /* Missing xattr only means no keys were 
there, i.e + a negative cache for the "loaded" keys + */ + if (!mdc->xattr) + goto unlock; + + if (dict) + *dict = dict_ref (mdc->xattr); + } +unlock: + UNLOCK (&mdc->lock); + +out: + return ret; +} + + +int +mdc_inode_iatt_invalidate (xlator_t *this, inode_t *inode) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + LOCK (&mdc->lock); + { + mdc->ia_time = 0; + } + UNLOCK (&mdc->lock); + +out: + return ret; +} + + +int +mdc_inode_xatt_invalidate (xlator_t *this, inode_t *inode) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + LOCK (&mdc->lock); + { + mdc->xa_time = 0; + } + UNLOCK (&mdc->lock); + +out: + return ret; +} + + +void +mdc_load_reqs (xlator_t *this, dict_t *dict) +{ + const char *mdc_key = NULL; + int i = 0; + int ret = 0; + + for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { + if (!mdc_keys[i].load) + continue; + ret = dict_set_int8 (dict, (char *)mdc_key, 0); + if (ret) + return; + } +} + + +struct checkpair { + int ret; + dict_t *rsp; +}; + + +static int +is_mdc_key_satisfied (const char *key) +{ + const char *mdc_key = NULL; + int i = 0; + + if (!key) + return 0; + + for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { + if (!mdc_keys[i].load) + continue; + if (strcmp (mdc_key, key) == 0) + return 1; + } + + return 0; +} + + +static int +checkfn (dict_t *this, char *key, data_t *value, void *data) +{ + struct checkpair *pair = data; + + if (!is_mdc_key_satisfied (key)) + pair->ret = 0; + + return 0; +} + + +int +mdc_xattr_satisfied (xlator_t *this, dict_t *req, dict_t *rsp) +{ + struct checkpair pair = { + .ret = 1, + .rsp = rsp, + }; + + dict_foreach (req, checkfn, &pair); + + return pair.ret; +} + + +int +mdc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *dict, struct iatt *postparent) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, stbuf); + mdc_inode_xatt_set (this, local->loc.inode, dict); + } +out: + MDC_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, + dict, postparent); + return 0; +} + + +int +mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + int ret = 0; + struct iatt stbuf = {0, }; + struct iatt postparent = {0, }; + dict_t *xattr_rsp = NULL; + dict_t *xattr_alloc = NULL; + mdc_local_t *local = NULL; + + + local = mdc_local_get (frame); + if (!local) + goto uncached; + + if (!loc->name) + /* A nameless discovery is dangerous to cache. 
We + perform nameless lookup with the intention of + re-establishing an inode "properly" + */ + goto uncached; + + loc_copy (&local->loc, loc); + + ret = mdc_inode_iatt_get (this, loc->inode, &stbuf); + if (ret != 0) + goto uncached; + + if (xdata) { + ret = mdc_inode_xatt_get (this, loc->inode, &xattr_rsp); + if (ret != 0) + goto uncached; + + if (!mdc_xattr_satisfied (this, xdata, xattr_rsp)) + goto uncached; + } + + MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf, + xattr_rsp, &postparent); + + if (xattr_rsp) + dict_unref (xattr_rsp); + + return 0; + +uncached: + if (!xdata) + xdata = xattr_alloc = dict_new (); + if (xdata) + mdc_load_reqs (this, xdata); + + STACK_WIND (frame, mdc_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, xdata); + + if (xattr_rsp) + dict_unref (xattr_rsp); + if (xattr_alloc) + dict_unref (xattr_alloc); + return 0; +} + + +int +mdc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + if (op_ret != 0) + goto out; + + local = frame->local; + if (!local) + goto out; + + mdc_inode_iatt_set (this, local->loc.inode, buf); + +out: + MDC_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + + +int +mdc_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret; + struct iatt stbuf; + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + if (!local) + goto uncached; + + loc_copy (&local->loc, loc); + + ret = mdc_inode_iatt_get (this, loc->inode, &stbuf); + if (ret != 0) + goto uncached; + + MDC_STACK_UNWIND (stat, frame, 0, 0, &stbuf, xdata); + + return 0; + +uncached: + STACK_WIND (frame, mdc_stat_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, + loc, xdata); + return 0; +} + + +int +mdc_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + if (op_ret != 0) + goto out; + + local = frame->local; + if (!local) + goto out; + + mdc_inode_iatt_set (this, local->fd->inode, buf); + +out: + MDC_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + + +int +mdc_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret; + struct iatt stbuf; + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + if (!local) + goto uncached; + + local->fd = fd_ref (fd); + + ret = mdc_inode_iatt_get (this, fd->inode, &stbuf); + if (ret != 0) + goto uncached; + + MDC_STACK_UNWIND (fstat, frame, 0, 0, &stbuf, xdata); + + return 0; + +uncached: + STACK_WIND (frame, mdc_fstat_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, + fd, xdata); + return 0; +} + + +int +mdc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->loc.inode = inode_ref (loc->inode); + + STACK_WIND (frame, mdc_truncate_cbk, + FIRST_CHILD(this), 
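
mdc_lookup() above only answers from the cache when every xattr the caller requested in xdata is one md-cache itself loads (mdc_load_reqs()/mdc_xattr_satisfied(), earlier in this file); a single foreign key forces the lookup down to the child. A standalone model of that handshake, using a string list in place of dict_t; the literal key names assume the usual expansions of the POSIX ACL xattr macros, and which entries have .load set depends on the cache-posix-acl/cache-selinux options:

#include <stdio.h>
#include <string.h>

/* Keys this instance has been configured to load and cache. */
static const char *loaded_keys[] = {
        "system.posix_acl_access",
        "system.posix_acl_default",
        NULL
};

/* is_mdc_key_satisfied(), modelled: can this one key come from cache? */
static int
key_satisfied (const char *key)
{
        int i;

        for (i = 0; loaded_keys[i]; i++)
                if (strcmp (loaded_keys[i], key) == 0)
                        return 1;
        return 0;
}

/* mdc_xattr_satisfied(), modelled: a cached reply is usable only when
 * every requested key is satisfied. */
static int
request_satisfied (const char **requested)
{
        int i;

        for (i = 0; requested[i]; i++)
                if (!key_satisfied (requested[i]))
                        return 0;
        return 1;
}

int
main (void)
{
        const char *ok[]   = { "system.posix_acl_access", NULL };
        const char *miss[] = { "trusted.glusterfs.quota.size", NULL };

        printf ("%d %d\n", request_satisfied (ok),    /* 1: serve cached  */
                           request_satisfied (miss)); /* 0: wind to child */
        return 0;
}
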
FIRST_CHILD(this)->fops->truncate, + loc, offset, xdata); + return 0; +} + + +int +mdc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + STACK_WIND (frame, mdc_ftruncate_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + + +int +mdc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, buf); + mdc_inode_xatt_set (this, local->loc.inode, local->xattr); + } +out: + MDC_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + local->xattr = dict_ref (xdata); + + STACK_WIND (frame, mdc_mknod_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; +} + + +int +mdc_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, buf); + mdc_inode_xatt_set (this, local->loc.inode, local->xattr); + } +out: + MDC_STACK_UNWIND (mkdir, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + local->xattr = dict_ref (xdata); + + STACK_WIND (frame, mdc_mkdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + + +int +mdc_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, NULL); + } + +out: + MDC_STACK_UNWIND 
(unlink, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, mdc_unlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; +} + + +int +mdc_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + +out: + MDC_STACK_UNWIND (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, mdc_rmdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, flag, xdata); + return 0; +} + + +int +mdc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, buf); + } +out: + MDC_STACK_UNWIND (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + local->linkname = gf_strdup (linkname); + + STACK_WIND (frame, mdc_symlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, + linkname, loc, umask, xdata); + return 0; +} + + +int +mdc_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postoldparent); + } + + if (local->loc.inode) { + /* TODO: fix dht_rename() not to return linkfile + attributes before setting attributes here + */ + + mdc_inode_iatt_set (this, local->loc.inode, NULL); + } + + if (local->loc2.parent) { + mdc_inode_iatt_set (this, local->loc2.parent, postnewparent); + } +out: + MDC_STACK_UNWIND (rename, frame, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); + return 0; +} + + +int +mdc_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); + + STACK_WIND (frame, mdc_rename_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); 
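
The unlink/rmdir/symlink/rename callbacks above share one piece of bookkeeping: on success the parent directory's cached iatt is refreshed from the post-op stat, and where the operation leaves an inode's attributes unknown (unlink, the rename source) its entry is dropped by passing a NULL iatt, which zeroes ia_time so the next stat misses the cache. Condensed below; struct stat stands in for struct iatt and refresh_or_drop() is an illustrative name:

#include <sys/stat.h>
#include <time.h>

struct cache_entry {
        time_t      ia_time;     /* 0 == not usable */
        struct stat buf;
};

static void
refresh_or_drop (struct cache_entry *e, const struct stat *newbuf)
{
        if (!newbuf) {           /* no trustworthy attributes: invalidate */
                e->ia_time = 0;
                return;
        }
        e->buf = *newbuf;        /* take the post-op attributes ...       */
        time (&e->ia_time);      /* ... and restart the freshness window  */
}
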
+ return 0; +} + + +int +mdc_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.inode) { + mdc_inode_iatt_set (this, local->loc.inode, buf); + } + + if (local->loc2.parent) { + mdc_inode_iatt_set (this, local->loc2.parent, postparent); + } +out: + MDC_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); + + STACK_WIND (frame, mdc_link_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + + +int +mdc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->loc.parent) { + mdc_inode_iatt_set (this, local->loc.parent, postparent); + } + + if (local->loc.inode) { + mdc_inode_iatt_set (this, inode, buf); + mdc_inode_xatt_set (this, local->loc.inode, local->xattr); + } +out: + MDC_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +mdc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + local->xattr = dict_ref (xdata); + + STACK_WIND (frame, mdc_create_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + + +int +mdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set (this, local->fd->inode, stbuf); + +out: + MDC_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, + stbuf, iobref, xdata); + + return 0; +} + + +int +mdc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + STACK_WIND (frame, mdc_readv_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + + +int +mdc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec 
*vector, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + STACK_WIND (frame, mdc_writev_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; +} + + +int +mdc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) { + mdc_inode_iatt_set (this, local->loc.inode, NULL); + goto out; + } + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int valid, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, mdc_setattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + + +int +mdc_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int valid, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + STACK_WIND (frame, mdc_fsetattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} + + +int +mdc_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + + +int +mdc_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + STACK_WIND (frame, mdc_fsync_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + fd, datasync, xdata); + return 0; +} + + +int +mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_xatt_update (this, local->loc.inode, local->xattr); + + mdc_inode_iatt_invalidate (this, local->loc.inode); + +out: + MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr, int flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + local->xattr = 
dict_ref (xattr); + + STACK_WIND (frame, mdc_setxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, xattr, flags, xdata); + return 0; +} + + +int +mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_xatt_update (this, local->fd->inode, local->xattr); + + mdc_inode_iatt_invalidate (this, local->fd->inode); +out: + MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *xattr, int flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + local->xattr = dict_ref (xattr); + + STACK_WIND (frame, mdc_fsetxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, + fd, xattr, flags, xdata); + return 0; +} + +int +mdc_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + if (op_ret != 0) + goto out; + + local = frame->local; + if (!local) + goto out; + + mdc_inode_xatt_update (this, local->loc.inode, xattr); + +out: + MDC_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; +} + + +int +mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +{ + int ret; + int op_errno = ENODATA; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + + local = mdc_local_get (frame); + if (!local) + goto uncached; + + loc_copy (&local->loc, loc); + + if (!is_mdc_key_satisfied (key)) + goto uncached; + + ret = mdc_inode_xatt_get (this, loc->inode, &xattr); + if (ret != 0) + goto uncached; + + if (!xattr || !dict_get (xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } + + MDC_STACK_UNWIND (getxattr, frame, ret, op_errno, xattr, xdata); + + return 0; + +uncached: + STACK_WIND (frame, mdc_getxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, + loc, key, xdata); + return 0; +} + + +int +mdc_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + if (op_ret != 0) + goto out; + + local = frame->local; + if (!local) + goto out; + + mdc_inode_xatt_update (this, local->fd->inode, xattr); + +out: + MDC_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; +} + + +int +mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + int ret; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + int op_errno = ENODATA; + + local = mdc_local_get (frame); + if (!local) + goto uncached; + + local->fd = fd_ref (fd); + + if (!is_mdc_key_satisfied (key)) + goto uncached; + + ret = mdc_inode_xatt_get (this, fd->inode, &xattr); + if (ret != 0) + goto uncached; + + if (!xattr || !dict_get (xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } + + MDC_STACK_UNWIND (fgetxattr, frame, ret, op_errno, xattr, xdata); + + return 0; + +uncached: + STACK_WIND (frame, mdc_fgetxattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, + fd, key, xdata); + return 0; +} + +int +mdc_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if 
(op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->key) + mdc_inode_xatt_unset (this, local->loc.inode, local->key); + else + mdc_inode_xatt_invalidate (this, local->loc.inode); + + mdc_inode_iatt_invalidate (this, local->loc.inode); +out: + MDC_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + local->key = gf_strdup (name); + + STACK_WIND (frame, mdc_removexattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +} + + +int +mdc_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->key) + mdc_inode_xatt_unset (this, local->fd->inode, local->key); + else + mdc_inode_xatt_invalidate (this, local->fd->inode); + + mdc_inode_iatt_invalidate (this, local->fd->inode); +out: + MDC_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + local->key = gf_strdup (name); + + STACK_WIND (frame, mdc_fremovexattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + + +int +mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + + if (op_ret <= 0) + goto unwind; + + list_for_each_entry (entry, &entries->list, list) { + if (!entry->inode) + continue; + mdc_inode_iatt_set (this, entry->inode, &entry->d_stat); + mdc_inode_xatt_set (this, entry->inode, entry->dict); + } + +unwind: + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +mdc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) +{ + dict_t *xattr_alloc = NULL; + + if (!xdata) + xdata = xattr_alloc = dict_new (); + if (xdata) + mdc_load_reqs (this, xdata); + + STACK_WIND (frame, mdc_readdirp_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp, + fd, size, offset, xdata); + if (xattr_alloc) + dict_unref (xattr_alloc); + return 0; +} + +int +mdc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +int +mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) +{ + int need_unref = 0; + struct mdc_conf *conf = this->private; + + if (!conf->force_readdirp) { + STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, + xdata); + return 0; + } + + if (!xdata) { + xdata = dict_new (); + need_unref = 1; + } + + if (xdata) + mdc_load_reqs (this, xdata); + + STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, + xdata); + + if (need_unref && xdata) + dict_unref (xdata); + + return 0; +} + +int +mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t 
*this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + + return 0; +} + +int +mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, + xdata); + + return 0; +} + +int +mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, + xdata); + + return 0; +} + + +int +mdc_forget (xlator_t *this, inode_t *inode) +{ + mdc_inode_wipe (this, inode); + + return 0; +} + + +int +is_strpfx (const char *str1, const char *str2) +{ + /* is one of the string a prefix of the other? 
*/ + int i; + + for (i = 0; str1[i] == str2[i]; i++) { + if (!str1[i] || !str2[i]) + break; + } + + return !(str1[i] && str2[i]); +} + + +int +mdc_key_load_set (struct mdc_key *keys, char *pattern, gf_boolean_t val) +{ + struct mdc_key *key = NULL; + + for (key = keys; key->name; key++) { + if (is_strpfx (key->name, pattern)) + key->load = val; + } + + return 0; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + struct mdc_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF ("md-cache-timeout", conf->timeout, options, int32, out); + + GF_OPTION_RECONF ("cache-selinux", conf->cache_selinux, options, bool, out); + mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux); + + GF_OPTION_RECONF ("cache-posix-acl", conf->cache_posix_acl, options, bool, out); + mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl); + + GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out); + +out: + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init (this, gf_mdc_mt_end + 1); + return ret; +} + +int +init (xlator_t *this) +{ + struct mdc_conf *conf = NULL; + + conf = GF_CALLOC (sizeof (*conf), 1, gf_mdc_mt_mdc_conf_t); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory"); + return -1; + } + + GF_OPTION_INIT ("md-cache-timeout", conf->timeout, int32, out); + + GF_OPTION_INIT ("cache-selinux", conf->cache_selinux, bool, out); + mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux); + + GF_OPTION_INIT ("cache-posix-acl", conf->cache_posix_acl, bool, out); + mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl); + + GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); +out: + this->private = conf; + + return 0; +} + + +void +fini (xlator_t *this) +{ + return; +} + + +struct xlator_fops fops = { + .lookup = mdc_lookup, + .stat = mdc_stat, + .fstat = mdc_fstat, + .truncate = mdc_truncate, + .ftruncate = mdc_ftruncate, + .mknod = mdc_mknod, + .mkdir = mdc_mkdir, + .unlink = mdc_unlink, + .rmdir = mdc_rmdir, + .symlink = mdc_symlink, + .rename = mdc_rename, + .link = mdc_link, + .create = mdc_create, + .readv = mdc_readv, + .writev = mdc_writev, + .setattr = mdc_setattr, + .fsetattr = mdc_fsetattr, + .fsync = mdc_fsync, + .setxattr = mdc_setxattr, + .fsetxattr = mdc_fsetxattr, + .getxattr = mdc_getxattr, + .fgetxattr = mdc_fgetxattr, + .removexattr = mdc_removexattr, + .fremovexattr= mdc_fremovexattr, + .readdirp = mdc_readdirp, + .readdir = mdc_readdir, + .fallocate = mdc_fallocate, + .discard = mdc_discard, + .zerofill = mdc_zerofill, +}; + + +struct xlator_cbks cbks = { + .forget = mdc_forget, +}; + +struct volume_options options[] = { + { .key = {"cache-selinux"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + }, + { .key = {"cache-posix-acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + }, + { .key = {"md-cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60, + .default_value = "1", + .description = "Time period after which cache has to be refreshed", + }, + { .key = {"force-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Convert all readdir requests to readdirplus to " + "collect stat info on each entry.", + }, + { .key = {NULL} }, +}; diff --git a/xlators/performance/open-behind/Makefile.am b/xlators/performance/open-behind/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/performance/open-behind/Makefile.am 
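
init()/reconfigure() above never name individual xattrs: mdc_key_load_set() walks mdc_keys[] and flips .load on every entry matching the pattern by prefix, so a single cache-selinux or cache-posix-acl toggle covers its whole key family. The prefix test is is_strpfx(), reproduced below with a small driver; the literal key names assume GF_SELINUX_XATTR_KEY expands to "security.selinux":

#include <stdio.h>

/* is_strpfx() from above: true when either string is a prefix of the
 * other.  The pattern "security." therefore matches both
 * "security.selinux" and "security.capability". */
static int
is_strpfx (const char *str1, const char *str2)
{
        int i;

        for (i = 0; str1[i] == str2[i]; i++) {
                if (!str1[i] || !str2[i])
                        break;
        }
        return !(str1[i] && str2[i]);
}

int
main (void)
{
        /* both print 1 */
        printf ("%d\n", is_strpfx ("security.selinux",    "security."));
        printf ("%d\n", is_strpfx ("security.capability", "security."));
        /* prints 0: a different key family */
        printf ("%d\n", is_strpfx ("system.posix_acl_access", "security."));
        return 0;
}
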
@@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am new file mode 100644 index 000000000..125285707 --- /dev/null +++ b/xlators/performance/open-behind/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = open-behind.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +open_behind_la_LDFLAGS = -module -avoid-version + +open_behind_la_SOURCES = open-behind.c +open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = open-behind-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h new file mode 100644 index 000000000..1e94296f4 --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind-mem-types.h @@ -0,0 +1,21 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __OB_MEM_TYPES_H__ +#define __OB_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_ob_mem_types_ { + gf_ob_mt_fd_t = gf_common_mt_end + 1, + gf_ob_mt_conf_t, + gf_ob_mt_end +}; +#endif diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c new file mode 100644 index 000000000..7e5b57278 --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind.c @@ -0,0 +1,1001 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "open-behind-mem-types.h" +#include "xlator.h" +#include "statedump.h" +#include "call-stub.h" +#include "defaults.h" + +typedef struct ob_conf { + gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe + e.g - fstat() readv() + + whereas for fops like writev(), lk(), + the fd is important for side effects + like mandatory locks + */ + gf_boolean_t lazy_open; /* delay backend open as much as possible */ +} ob_conf_t; + + +typedef struct ob_fd { + call_frame_t *open_frame; + loc_t loc; + dict_t *xdata; + int flags; + int op_errno; + struct list_head list; +} ob_fd_t; + + +ob_fd_t * +__ob_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + uint64_t value = 0; + int ret = -1; + ob_fd_t *ob_fd = NULL; + + ret = __fd_ctx_get (fd, this, &value); + if (ret) + return NULL; + + ob_fd = (void *) ((long) value); + + return ob_fd; +} + + +ob_fd_t * +ob_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + ob_fd_t *ob_fd = NULL; + + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + } + UNLOCK (&fd->lock); + + return ob_fd; +} + + +int +__ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +{ + uint64_t value = 0; + int ret = -1; + + value = (long) ((void *) ob_fd); + + ret = __fd_ctx_set (fd, this, value); + + return ret; +} + + +int +ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +{ + int ret = -1; + + LOCK (&fd->lock); + { + ret = __ob_fd_ctx_set (this, fd, ob_fd); + } + UNLOCK (&fd->lock); + + return ret; +} + + +ob_fd_t * +ob_fd_new (void) +{ + ob_fd_t *ob_fd = NULL; + + ob_fd = GF_CALLOC (1, sizeof (*ob_fd), gf_ob_mt_fd_t); + + INIT_LIST_HEAD (&ob_fd->list); + + return ob_fd; +} + + +void +ob_fd_free (ob_fd_t *ob_fd) +{ + loc_wipe (&ob_fd->loc); + + if (ob_fd->xdata) + dict_unref (ob_fd->xdata); + + if (ob_fd->open_frame) + STACK_DESTROY (ob_fd->open_frame->root); + + GF_FREE (ob_fd); +} + + +int +ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata) +{ + fd_t *fd = NULL; + struct list_head list; + ob_fd_t *ob_fd = NULL; + call_stub_t *stub = NULL, *tmp = NULL; + + fd = frame->local; + frame->local = NULL; + + INIT_LIST_HEAD (&list); + + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + + list_splice_init (&ob_fd->list, &list); + + if (op_ret < 0) { + /* mark fd BAD for ever */ + ob_fd->op_errno = op_errno; + } else { + __fd_ctx_del (fd, this, NULL); + ob_fd_free (ob_fd); + } + } + UNLOCK (&fd->lock); + + list_for_each_entry_safe (stub, tmp, &list, list) { + list_del_init (&stub->list); + + if (op_ret < 0) + call_unwind_error (stub, -1, op_errno); + else + call_resume (stub); + } + + fd_unref (fd); + + STACK_DESTROY (frame->root); + + return 0; +} + + +int +ob_fd_wake (xlator_t *this, fd_t *fd) +{ + call_frame_t *frame = NULL; + ob_fd_t *ob_fd = NULL; + + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + if (!ob_fd) + goto unlock; + + frame = ob_fd->open_frame; + ob_fd->open_frame = NULL; + } +unlock: + UNLOCK (&fd->lock); + + if (frame) { + frame->local = fd_ref (fd); + + STACK_WIND (frame, ob_wake_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + &ob_fd->loc, ob_fd->flags, fd, ob_fd->xdata); + } + + return 0; +} + + +int +open_and_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + ob_fd_t *ob_fd = NULL; + int op_errno = 0; + + if (!fd) + goto nofd; + + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + if (!ob_fd) + goto unlock; + + if (ob_fd->op_errno) { + op_errno = ob_fd->op_errno; + goto unlock; + } + + list_add_tail (&stub->list, 
&ob_fd->list); + } +unlock: + UNLOCK (&fd->lock); + +nofd: + if (op_errno) + call_unwind_error (stub, -1, op_errno); + else if (ob_fd) + ob_fd_wake (this, fd); + else + call_resume (stub); + + return 0; +} + + +int +ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + ob_fd_t *ob_fd = NULL; + int ret = -1; + ob_conf_t *conf = NULL; + + + conf = this->private; + + if (flags & O_TRUNC) { + STACK_WIND (frame, default_open_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; + } + + ob_fd = ob_fd_new (); + if (!ob_fd) + goto enomem; + + ob_fd->open_frame = copy_frame (frame); + if (!ob_fd->open_frame) + goto enomem; + ret = loc_copy (&ob_fd->loc, loc); + if (ret) + goto enomem; + + ob_fd->flags = flags; + if (xdata) + ob_fd->xdata = dict_ref (xdata); + + ret = ob_fd_ctx_set (this, fd, ob_fd); + if (ret) + goto enomem; + + fd_ref (fd); + + STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata); + + if (!conf->lazy_open) + ob_fd_wake (this, fd); + + fd_unref (fd); + + return 0; +enomem: + if (ob_fd) { + if (ob_fd->open_frame) + STACK_DESTROY (ob_fd->open_frame->root); + loc_wipe (&ob_fd->loc); + if (ob_fd->xdata) + dict_unref (ob_fd->xdata); + GF_FREE (ob_fd); + } + + return -1; +} + + +int +ob_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + fd_t *old_fd = NULL; + int ret = -1; + int op_errno = 0; + call_stub_t *stub = NULL; + + old_fd = fd_lookup (fd->inode, 0); + if (old_fd) { + /* open-behind only when this is the first FD */ + stub = fop_open_stub (frame, default_open_resume, + loc, flags, fd, xdata); + if (!stub) { + op_errno = ENOMEM; + fd_unref (old_fd); + goto err; + } + + open_and_resume (this, old_fd, stub); + + fd_unref (old_fd); + + return 0; + } + + ret = ob_open_behind (frame, this, loc, flags, fd, xdata); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + return 0; +err: + gf_log (this->name, GF_LOG_ERROR, "%s: %s", loc->path, + strerror (op_errno)); + + STACK_UNWIND_STRICT (open, frame, -1, op_errno, 0, 0); + + return 0; +} + + +fd_t * +ob_get_wind_fd (xlator_t *this, fd_t *fd) +{ + ob_conf_t *conf = NULL; + ob_fd_t *ob_fd = NULL; + + conf = this->private; + + ob_fd = ob_fd_ctx_get (this, fd); + + if (ob_fd && conf->use_anonymous_fd) + return fd_anonymous (fd->inode); + + return fd_ref (fd); +} + + +int +ob_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + fd_t *wind_fd = NULL; + + wind_fd = ob_get_wind_fd (this, fd); + + stub = fop_readv_stub (frame, default_readv_resume, wind_fd, + size, offset, flags, xdata); + fd_unref (wind_fd); + + if (!stub) + goto err; + + open_and_resume (this, wind_fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); + + return 0; +} + + +int +ob_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_writev_stub (frame, default_writev_resume, fd, iov, count, + offset, flags, iobref, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, 0, 0, 0); + + return 0; +} + + +int +ob_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + fd_t *wind_fd = NULL; + + wind_fd = ob_get_wind_fd (this, fd); + + 
stub = fop_fstat_stub (frame, default_fstat_resume, wind_fd, xdata); + + fd_unref (wind_fd); + + if (!stub) + goto err; + + open_and_resume (this, wind_fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, 0, 0); + + return 0; +} + + +int +ob_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + ob_fd_t *ob_fd = NULL; + gf_boolean_t unwind = _gf_false; + + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + if (ob_fd && ob_fd->open_frame) + /* if open() was never wound to backend, + no need to wind flush() either. + */ + unwind = _gf_true; + } + UNLOCK (&fd->lock); + + if (unwind) + goto unwind; + + stub = fop_flush_stub (frame, default_flush_resume, fd, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, 0); + + return 0; + +unwind: + STACK_UNWIND_STRICT (flush, frame, 0, 0, 0); + + return 0; +} + + +int +ob_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fsync_stub (frame, default_fsync_resume, fd, flag, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + + return 0; +} + + +int +ob_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_lk_stub (frame, default_lk_resume, fd, cmd, flock, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM, 0, 0); + + return 0; +} + +int +ob_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_ftruncate_stub (frame, default_ftruncate_resume, fd, offset, + xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, 0, 0, 0); + + return 0; +} + + +int +ob_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr, + flags, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, 0); + + return 0; +} + + +int +ob_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, fd, name, + xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, 0, 0); + + return 0; +} + + +int +ob_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fremovexattr_stub (frame, default_fremovexattr_resume, fd, + name, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOMEM, 0); + + return 0; +} + + +int +ob_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_finodelk_stub (frame, default_finodelk_resume, volume, fd, + cmd, flock, xdata); + if (!stub) + goto err; + + 
open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, 0); + + return 0; +} + + +int +ob_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fentrylk_stub (frame, default_fentrylk_resume, volume, fd, + basename, cmd, type, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, 0); + + return 0; +} + + +int +ob_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fxattrop_stub (frame, default_fxattrop_resume, fd, optype, + xattr, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM, 0, 0); + + return 0; +} + + +int +ob_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *iatt, int valid, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + stub = fop_fsetattr_stub (frame, default_fsetattr_resume, fd, + iatt, valid, xdata); + if (!stub) + goto err; + + open_and_resume (this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, 0, 0, 0); + + return 0; +} + +int +ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, + offset, len, xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len, + xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_zerofill_stub(frame, default_zerofill_resume, fd, + offset, len, xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + + +int +ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + fd_t *fd = NULL; + call_stub_t *stub = NULL; + + stub = fop_unlink_stub (frame, default_unlink_resume, loc, + xflags, xdata); + if (!stub) + goto err; + + fd = fd_lookup (loc->inode, 0); + + open_and_resume (this, fd, stub); + if (fd) + fd_unref (fd); + + return 0; +err: + STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, 0, 0, 0); + + return 0; +} + + +int +ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, + dict_t *xdata) +{ + fd_t *fd = NULL; + call_stub_t *stub = NULL; + + stub = fop_rename_stub (frame, default_rename_resume, src, dst, xdata); + if (!stub) + goto err; + + if (dst->inode) + fd = fd_lookup (dst->inode, 0); + + open_and_resume (this, fd, stub); + if (fd) + fd_unref (fd); + + return 0; +err: + STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0); + + return 0; +} + + +int +ob_release (xlator_t *this, fd_t 
*fd) +{ + ob_fd_t *ob_fd = NULL; + + ob_fd = ob_fd_ctx_get (this, fd); + + ob_fd_free (ob_fd); + + return 0; +} + + +int +ob_priv_dump (xlator_t *this) +{ + ob_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + if (!conf) + return -1; + + gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind", + "priv"); + + gf_proc_dump_add_section (key_prefix); + + gf_proc_dump_write ("use_anonymous_fd", "%d", conf->use_anonymous_fd); + + gf_proc_dump_write ("lazy_open", "%d", conf->lazy_open); + + return 0; +} + + +int +ob_fdctx_dump (xlator_t *this, fd_t *fd) +{ + ob_fd_t *ob_fd = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + int ret = 0; + + ret = TRY_LOCK (&fd->lock); + if (ret) + return 0; + + ob_fd = __ob_fd_ctx_get (this, fd); + if (!ob_fd) { + UNLOCK (&fd->lock); + return 0; + } + + gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind", + "file"); + gf_proc_dump_add_section (key_prefix); + + gf_proc_dump_write ("fd", "%p", fd); + + gf_proc_dump_write ("open_frame", "%p", ob_fd->open_frame); + + gf_proc_dump_write ("open_frame.root.unique", "%p", + ob_fd->open_frame->root->unique); + + gf_proc_dump_write ("loc.path", "%s", ob_fd->loc.path); + + gf_proc_dump_write ("loc.ino", "%s", uuid_utoa (ob_fd->loc.gfid)); + + gf_proc_dump_write ("flags", "%p", ob_fd->open_frame); + + UNLOCK (&fd->lock); + + return 0; +} + + +int +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init (this, gf_ob_mt_end + 1); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting failed"); + + return ret; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + ob_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + GF_OPTION_RECONF ("use-anonymous-fd", conf->use_anonymous_fd, options, + bool, out); + + GF_OPTION_RECONF ("lazy-open", conf->lazy_open, options, bool, out); + + ret = 0; +out: + return ret; +} + + +int +init (xlator_t *this) +{ + ob_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: volume (%s) not configured with exactly one " + "child", this->name); + return -1; + } + + if (!this->parents) + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + + conf = GF_CALLOC (1, sizeof (*conf), gf_ob_mt_conf_t); + if (!conf) + goto err; + + GF_OPTION_INIT ("use-anonymous-fd", conf->use_anonymous_fd, bool, err); + + GF_OPTION_INIT ("lazy-open", conf->lazy_open, bool, err); + + this->private = conf; + + return 0; +err: + if (conf) + GF_FREE (conf); + + return -1; +} + + +void +fini (xlator_t *this) +{ + ob_conf_t *conf = NULL; + + conf = this->private; + + GF_FREE (conf); + + return; +} + + +struct xlator_fops fops = { + .open = ob_open, + .readv = ob_readv, + .writev = ob_writev, + .flush = ob_flush, + .fsync = ob_fsync, + .fstat = ob_fstat, + .ftruncate = ob_ftruncate, + .fsetxattr = ob_fsetxattr, + .fgetxattr = ob_fgetxattr, + .fremovexattr = ob_fremovexattr, + .finodelk = ob_finodelk, + .fentrylk = ob_fentrylk, + .fxattrop = ob_fxattrop, + .fsetattr = ob_fsetattr, + .fallocate = ob_fallocate, + .discard = ob_discard, + .zerofill = ob_zerofill, + .unlink = ob_unlink, + .rename = ob_rename, + .lk = ob_lk, +}; + +struct xlator_cbks cbks = { + .release = ob_release, +}; + +struct xlator_dumpops dumpops = { + .priv = ob_priv_dump, + .fdctx = ob_fdctx_dump, +}; + + +struct volume_options options[] = { + { .key = {"use-anonymous-fd"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = "For read operations, use anonymous FD when " + "original FD is open-behind and not yet opened in the backend.", + }, + { .key = {"lazy-open"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = "Perform open in the backend only when a necessary " + "FOP arrives (e.g writev on the FD, unlink of the file). When option " + "is disabled, perform backend open right after unwinding open().", + }, + { .key = {NULL} } + +}; diff --git a/xlators/performance/quick-read/Makefile.am b/xlators/performance/quick-read/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/quick-read/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am new file mode 100644 index 000000000..4906f408a --- /dev/null +++ b/xlators/performance/quick-read/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = quick-read.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +quick_read_la_LDFLAGS = -module -avoid-version + +quick_read_la_SOURCES = quick-read.c +quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = quick-read.h quick-read-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/quick-read/src/quick-read-mem-types.h b/xlators/performance/quick-read/src/quick-read-mem-types.h new file mode 100644 index 000000000..78547f641 --- /dev/null +++ b/xlators/performance/quick-read/src/quick-read-mem-types.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __QR_MEM_TYPES_H__ +#define __QR_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_qr_mem_types_ { + gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, + gf_qr_mt_content_t, + gf_qr_mt_qr_fd_ctx_t, + gf_qr_mt_iovec, + gf_qr_mt_qr_conf_t, + gf_qr_mt_qr_priority_t, + gf_qr_mt_qr_private_t, + gf_qr_mt_qr_unlink_ctx_t, + gf_qr_mt_end +}; +#endif diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c new file mode 100644 index 000000000..445ea8658 --- /dev/null +++ b/xlators/performance/quick-read/src/quick-read.c @@ -0,0 +1,1147 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "quick-read.h" +#include "statedump.h" + +qr_inode_t *qr_inode_ctx_get (xlator_t *this, inode_t *inode); +void __qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode); + + +int +__qr_inode_ctx_set (xlator_t *this, inode_t *inode, qr_inode_t *qr_inode) +{ + uint64_t value = 0; + int ret = -1; + + value = (long) qr_inode; + + ret = __inode_ctx_set (inode, this, &value); + + return ret; +} + + +qr_inode_t * +__qr_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + uint64_t value = 0; + int ret = -1; + + ret = __inode_ctx_get (inode, this, &value); + if (ret) + return NULL; + + qr_inode = (void *) ((long) value); + + return qr_inode; +} + + +qr_inode_t * +qr_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + + LOCK (&inode->lock); + { + qr_inode = __qr_inode_ctx_get (this, inode); + } + UNLOCK (&inode->lock); + + return qr_inode; +} + + +qr_inode_t * +qr_inode_new (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + + qr_inode = GF_CALLOC (1, sizeof (*qr_inode), gf_qr_mt_qr_inode_t); + if (!qr_inode) + return NULL; + + INIT_LIST_HEAD (&qr_inode->lru); + + qr_inode->priority = 0; /* initial priority */ + + return qr_inode; +} + + +qr_inode_t * +qr_inode_ctx_get_or_new (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + int ret = -1; + qr_private_t *priv = NULL; + + priv = this->private; + + LOCK (&inode->lock); + { + qr_inode = __qr_inode_ctx_get (this, inode); + if (qr_inode) + goto unlock; + + qr_inode = qr_inode_new (this, inode); + if (!qr_inode) + goto unlock; + + ret = __qr_inode_ctx_set (this, inode, qr_inode); + if (ret) { + __qr_inode_prune (&priv->table, qr_inode); + GF_FREE (qr_inode); + } + } +unlock: + UNLOCK (&inode->lock); + + return qr_inode; +} + + +uint32_t +qr_get_priority (qr_conf_t *conf, const char *path) +{ + uint32_t priority = 0; + struct qr_priority *curr = NULL; + + list_for_each_entry (curr, &conf->priority_list, list) { + if (fnmatch (curr->pattern, path, FNM_NOESCAPE) == 0) + priority = curr->priority; + } + + return priority; +} + + +void +__qr_inode_register (qr_inode_table_t *table, qr_inode_t *qr_inode) +{ + if (!qr_inode->data) + return; + + if (list_empty (&qr_inode->lru)) + /* first time addition of this qr_inode into table */ + table->cache_used += qr_inode->size; + else + list_del_init (&qr_inode->lru); + + list_add_tail (&qr_inode->lru, &table->lru[qr_inode->priority]); +} + + +void +qr_inode_set_priority (xlator_t *this, inode_t *inode, const char *path) +{ + uint32_t priority = 0; + 
qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + qr_inode = qr_inode_ctx_get (this, inode); + if (!qr_inode) + return; + + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + if (path) + priority = qr_get_priority (conf, path); + else + /* retain existing priority, just bump LRU */ + priority = qr_inode->priority; + + LOCK (&table->lock); + { + qr_inode->priority = priority; + + __qr_inode_register (table, qr_inode); + } + UNLOCK (&table->lock); +} + + +/* To be called with priv->table.lock held */ +void +__qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode) +{ + GF_FREE (qr_inode->data); + qr_inode->data = NULL; + + if (!list_empty (&qr_inode->lru)) { + table->cache_used -= qr_inode->size; + qr_inode->size = 0; + + list_del_init (&qr_inode->lru); + } + + memset (&qr_inode->buf, 0, sizeof (qr_inode->buf)); +} + + +void +qr_inode_prune (xlator_t *this, inode_t *inode) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; + + qr_inode = qr_inode_ctx_get (this, inode); + if (!qr_inode) + return; + + priv = this->private; + table = &priv->table; + + LOCK (&table->lock); + { + __qr_inode_prune (table, qr_inode); + } + UNLOCK (&table->lock); +} + + +/* To be called with priv->table.lock held */ +void +__qr_cache_prune (qr_inode_table_t *table, qr_conf_t *conf) +{ + qr_inode_t *curr = NULL; + qr_inode_t *next = NULL; + int index = 0; + size_t size_pruned = 0; + + for (index = 0; index < conf->max_pri; index++) { + list_for_each_entry_safe (curr, next, &table->lru[index], lru) { + + size_pruned += curr->size; + + __qr_inode_prune (table, curr); + + if (table->cache_used < conf->cache_size) + return; + } + } + + return; +} + + +void +qr_cache_prune (xlator_t *this) +{ + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_table_t *table = NULL; + + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + LOCK (&table->lock); + { + if (table->cache_used > conf->cache_size) + __qr_cache_prune (table, conf); + } + UNLOCK (&table->lock); +} + + +void * +qr_content_extract (dict_t *xdata) +{ + data_t *data = NULL; + void *content = NULL; + + data = dict_get (xdata, GF_CONTENT_KEY); + if (!data) + return NULL; + + content = GF_CALLOC (1, data->len, gf_qr_mt_content_t); + if (!content) + return NULL; + + memcpy (content, data->data, data->len); + + return content; +} + + +void +qr_content_update (xlator_t *this, qr_inode_t *qr_inode, void *data, + struct iatt *buf) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + + priv = this->private; + table = &priv->table; + + LOCK (&table->lock); + { + __qr_inode_prune (table, qr_inode); + + qr_inode->data = data; + qr_inode->size = buf->ia_size; + + qr_inode->ia_mtime = buf->ia_mtime; + qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec; + + qr_inode->buf = *buf; + + gettimeofday (&qr_inode->last_refresh, NULL); + + __qr_inode_register (table, qr_inode); + } + UNLOCK (&table->lock); + + qr_cache_prune (this); +} + + +gf_boolean_t +qr_size_fits (qr_conf_t *conf, struct iatt *buf) +{ + return (buf->ia_size <= conf->max_file_size); +} + + +gf_boolean_t +qr_mtime_equal (qr_inode_t *qr_inode, struct iatt *buf) +{ + return (qr_inode->ia_mtime == buf->ia_mtime && + qr_inode->ia_mtime_nsec == buf->ia_mtime_nsec); +} + + +void +__qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_conf_t *conf = 
NULL; + + priv = this->private; + table = &priv->table; + conf = &priv->conf; + + if (qr_size_fits (conf, buf) && qr_mtime_equal (qr_inode, buf)) { + qr_inode->buf = *buf; + + gettimeofday (&qr_inode->last_refresh, NULL); + + __qr_inode_register (table, qr_inode); + } else { + __qr_inode_prune (table, qr_inode); + } + + return; +} + + +void +qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + + priv = this->private; + table = &priv->table; + + LOCK (&table->lock); + { + __qr_content_refresh (this, qr_inode, buf); + } + UNLOCK (&table->lock); +} + + +gf_boolean_t +__qr_cache_is_fresh (xlator_t *this, qr_inode_t *qr_inode) +{ + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; + struct timeval now; + struct timeval diff; + + priv = this->private; + conf = &priv->conf; + + gettimeofday (&now, NULL); + + timersub (&now, &qr_inode->last_refresh, &diff); + + if (diff.tv_sec >= conf->cache_timeout) + return _gf_false; + + return _gf_true; +} + + +int +qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode_ret, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + void *content = NULL; + qr_inode_t *qr_inode = NULL; + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (op_ret == -1) { + qr_inode_prune (this, inode); + goto out; + } + + if (dict_get (xdata, "sh-failed")) { + qr_inode_prune (this, inode); + goto out; + } + + content = qr_content_extract (xdata); + + if (content) { + /* new content came along, always replace old content */ + qr_inode = qr_inode_ctx_get_or_new (this, inode); + if (!qr_inode) + /* no harm done */ + goto out; + + qr_content_update (this, qr_inode, content, buf); + } else { + /* purge old content if necessary */ + qr_inode = qr_inode_ctx_get (this, inode); + if (!qr_inode) + /* usual path for large files */ + goto out; + + qr_content_refresh (this, qr_inode, buf); + } +out: + if (inode) + inode_unref (inode); + + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode_ret, + buf, xdata, postparent); + return 0; +} + + +int +qr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_t *qr_inode = NULL; + int ret = -1; + dict_t *new_xdata = NULL; + + priv = this->private; + conf = &priv->conf; + + qr_inode = qr_inode_ctx_get (this, loc->inode); + if (qr_inode && qr_inode->data) + /* cached. 
only validate in qr_lookup_cbk */ + goto wind; + + if (!xdata) + xdata = new_xdata = dict_new (); + + if (!xdata) + goto wind; + + ret = 0; + if (conf->max_file_size) + ret = dict_set (xdata, GF_CONTENT_KEY, + data_from_uint64 (conf->max_file_size)); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "cannot set key in request dict (%s)", + loc->path); +wind: + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, qr_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + if (new_xdata) + dict_unref (new_xdata); + + return 0; +} + + +int +qr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + qr_inode_t *qr_inode = NULL; + + if (op_ret <= 0) + goto unwind; + + list_for_each_entry (entry, &entries->list, list) { + if (!entry->inode) + continue; + + qr_inode = qr_inode_ctx_get (this, entry->inode); + if (!qr_inode) + /* no harm */ + continue; + + qr_content_refresh (this, qr_inode, &entry->d_stat); + } + +unwind: + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +qr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, qr_readdirp_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp, + fd, size, offset, xdata); + return 0; +} + + +int +qr_readv_cached (call_frame_t *frame, qr_inode_t *qr_inode, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + xlator_t *this = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + int op_ret = -1; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = {0, }; + struct iatt buf = {0, }; + + this = frame->this; + priv = this->private; + table = &priv->table; + + LOCK (&table->lock); + { + op_ret = -1; + + if (!qr_inode->data) + goto unlock; + + if (offset >= qr_inode->size) + goto unlock; + + if (!__qr_cache_is_fresh (this, qr_inode)) + goto unlock; + + op_ret = min (size, (qr_inode->size - offset)); + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, op_ret); + if (!iobuf) { + op_ret = -1; + goto unlock; + } + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + iobuf_unref (iobuf); + goto unlock; + } + + iobref_add (iobref, iobuf); + + memcpy (iobuf->ptr, qr_inode->data + offset, op_ret); + + buf = qr_inode->buf; + + /* bump LRU */ + __qr_inode_register (table, qr_inode); + } +unlock: + UNLOCK (&table->lock); + + if (op_ret > 0) { + iov.iov_base = iobuf->ptr; + iov.iov_len = op_ret; + + STACK_UNWIND_STRICT (readv, frame, op_ret, 0, &iov, 1, + &buf, iobref, xdata); + } + + if (iobuf) + iobuf_unref (iobuf); + + if (iobref) + iobref_unref (iobref); + + return op_ret; +} + + +int +qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + qr_inode_t *qr_inode = NULL; + + qr_inode = qr_inode_ctx_get (this, fd->inode); + if (!qr_inode) + goto wind; + + if (qr_readv_cached (frame, qr_inode, size, offset, flags, xdata) <= 0) + goto wind; + + return 0; +wind: + STACK_WIND (frame, default_readv_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + + +int +qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + qr_inode_prune (this, fd->inode); + + STACK_WIND (frame, default_writev_cbk, + FIRST_CHILD (this), 
FIRST_CHILD (this)->fops->writev, + fd, iov, count, offset, flags, iobref, xdata); + return 0; +} + + +int +qr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + qr_inode_prune (this, loc->inode); + + STACK_WIND (frame, default_truncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + + +int +qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + qr_inode_prune (this, fd->inode); + + STACK_WIND (frame, default_ftruncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + + +int +qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + qr_inode_set_priority (this, fd->inode, loc->path); + + STACK_WIND (frame, default_open_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +int +qr_forget (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + + qr_inode = qr_inode_ctx_get (this, inode); + + if (!qr_inode) + return 0; + + qr_inode_prune (this, inode); + + GF_FREE (qr_inode); + + return 0; +} + + +int32_t +qr_inodectx_dump (xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + int32_t ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + char buf[256] = {0, }; + + qr_inode = qr_inode_ctx_get (this, inode); + if (!qr_inode) + goto out; + + gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", + "inodectx"); + gf_proc_dump_add_section (key_prefix); + + gf_proc_dump_write ("entire-file-cached", "%s", qr_inode->data ? "yes" : "no"); + + if (qr_inode->last_refresh.tv_sec) { + gf_time_fmt (buf, sizeof buf, qr_inode->last_refresh.tv_sec, + gf_timefmt_FT); + snprintf (buf + strlen (buf), sizeof buf - strlen (buf), + ".%"GF_PRI_SUSECONDS, qr_inode->last_refresh.tv_usec); + + gf_proc_dump_write ("last-cache-validation-time", "%s", buf); + } + + ret = 0; +out: + return ret; +} + + +int +qr_priv_dump (xlator_t *this) +{ + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + uint32_t file_count = 0; + uint32_t i = 0; + qr_inode_t *curr = NULL; + uint64_t total_size = 0; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + if (!this) { + return -1; + } + + priv = this->private; + conf = &priv->conf; + + if (!conf) + return -1; + + table = &priv->table; + + gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", + "priv"); + + gf_proc_dump_add_section (key_prefix); + + gf_proc_dump_write ("max_file_size", "%d", conf->max_file_size); + gf_proc_dump_write ("cache_timeout", "%d", conf->cache_timeout); + + if (!table) { + goto out; + } else { + for (i = 0; i < conf->max_pri; i++) { + list_for_each_entry (curr, &table->lru[i], lru) { + file_count++; + total_size += curr->size; + } + } + } + + gf_proc_dump_write ("total_files_cached", "%d", file_count); + gf_proc_dump_write ("total_cache_used", "%d", total_size); + +out: + return 0; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_qr_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + + +static gf_boolean_t +check_cache_size_ok (xlator_t *this, int64_t cache_size) +{ + int ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT (this); + opt = 
xlator_volume_option_get (this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_log (this->name, GF_LOG_ERROR, + "could not get cache-size option"); + goto out; + } + + total_mem = get_mem_size (); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_log (this->name, GF_LOG_DEBUG, "Max cache size is %"PRIu64, + max_cache_size); + if (cache_size > max_cache_size) { + ret = _gf_false; + gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 + " is greater than the max size of %"PRIu64, + cache_size, max_cache_size); + goto out; + } +out: + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + uint64_t cache_size_new = 0; + + GF_VALIDATE_OR_GOTO ("quick-read", this, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); + GF_VALIDATE_OR_GOTO (this->name, options, out); + + priv = this->private; + + conf = &priv->conf; + if (!conf) { + goto out; + } + + GF_OPTION_RECONF ("cache-timeout", conf->cache_timeout, options, int32, + out); + + GF_OPTION_RECONF ("cache-size", cache_size_new, options, size, out); + if (!check_cache_size_ok (this, cache_size_new)) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Not reconfiguring cache-size"); + goto out; + } + conf->cache_size = cache_size_new; + + ret = 0; +out: + return ret; +} + + +int32_t +qr_get_priority_list (const char *opt_str, struct list_head *first) +{ + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *priority_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = NULL; + struct qr_priority *curr = NULL, *tmp = NULL; + + GF_VALIDATE_OR_GOTO ("quick-read", opt_str, out); + GF_VALIDATE_OR_GOTO ("quick-read", first, out); + + string = gf_strdup (opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. 
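Illustrative aside (not part of the patch): the "pattern:priority" string format referenced in the comment here is split with strtok_r() much as in the following self-contained sketch; the input string is only an example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
        /* same shape as "option priority *.jpg:1,abc*:2" */
        char  opt[] = "*.jpg:1,abc*:2";
        char *sp1 = NULL, *sp2 = NULL;
        char *tok = NULL, *pattern = NULL, *prio = NULL;

        for (tok = strtok_r (opt, ",", &sp1); tok;
             tok = strtok_r (NULL, ",", &sp1)) {
                pattern = strtok_r (tok, ":", &sp2);
                prio    = strtok_r (NULL, ":", &sp2);
                if (!pattern || !prio)
                        continue;

                /* qr_get_priority_list() stores each such pair in a
                   qr_priority entry on conf->priority_list */
                printf ("pattern=%s priority=%ld\n",
                        pattern, strtol (prio, NULL, 0));
        }

        return 0;
}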
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + priority_str = strtok_r (string, ",", &tmp_str); + while (priority_str) { + curr = GF_CALLOC (1, sizeof (*curr), gf_qr_mt_qr_priority_t); + if (curr == NULL) { + max_pri = -1; + goto out; + } + + list_add_tail (&curr->list, first); + + dup_str = gf_strdup (priority_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } + + priority = strtok_r (NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; + } + + gf_log ("quick-read", GF_LOG_TRACE, + "quick-read priority : pattern %s : priority %s", + pattern, + priority); + + curr->pattern = gf_strdup (pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; + } + + curr->priority = strtol (priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max (max_pri, curr->priority); + } + + GF_FREE (dup_str); + dup_str = NULL; + + priority_str = strtok_r (NULL, ",", &tmp_str); + } +out: + GF_FREE (string); + + GF_FREE (dup_str); + + if (max_pri == -1) { + list_for_each_entry_safe (curr, tmp, first, list) { + list_del_init (&curr->list); + GF_FREE (curr->pattern); + GF_FREE (curr); + } + } + + return max_pri; +} + + +int32_t +init (xlator_t *this) +{ + int32_t ret = -1, i = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: volume (%s) not configured with exactly one " + "child", this->name); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_qr_mt_qr_private_t); + if (priv == NULL) { + ret = -1; + goto out; + } + + LOCK_INIT (&priv->table.lock); + conf = &priv->conf; + + GF_OPTION_INIT ("max-file-size", conf->max_file_size, size, out); + + GF_OPTION_INIT ("cache-timeout", conf->cache_timeout, int32, out); + + GF_OPTION_INIT ("cache-size", conf->cache_size, size, out); + if (!check_cache_size_ok (this, conf->cache_size)) { + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&conf->priority_list); + conf->max_pri = 1; + if (dict_get (this->options, "priority")) { + char *option_list = data_to_str (dict_get (this->options, + "priority")); + gf_log (this->name, GF_LOG_TRACE, + "option path %s", option_list); + /* parse the list of pattern:priority */ + conf->max_pri = qr_get_priority_list (option_list, + &conf->priority_list); + + if (conf->max_pri == -1) { + goto out; + } + conf->max_pri ++; + } + + priv->table.lru = GF_CALLOC (conf->max_pri, sizeof (*priv->table.lru), + gf_common_mt_list_head); + if (priv->table.lru == NULL) { + ret = -1; + goto out; + } + + for (i = 0; i < conf->max_pri; i++) { + INIT_LIST_HEAD (&priv->table.lru[i]); + } + + ret = 0; + + this->private = priv; +out: + if ((ret == -1) && priv) { + GF_FREE (priv); + } + + return ret; +} + + +void +qr_inode_table_destroy (qr_private_t *priv) +{ + int i = 0; + qr_conf_t *conf = NULL; + + conf = &priv->conf; + + for (i = 0; i < conf->max_pri; i++) { + GF_ASSERT (list_empty (&priv->table.lru[i])); + } + + LOCK_DESTROY (&priv->table.lock); + + return; +} + + +void +qr_conf_destroy (qr_conf_t *conf) +{ + struct qr_priority *curr = NULL, *tmp = NULL; + + list_for_each_entry_safe (curr, tmp, &conf->priority_list, list) { + list_del (&curr->list); + GF_FREE (curr->pattern); + GF_FREE (curr); + } + + return; +} + + +void +fini (xlator_t *this) +{ + qr_private_t *priv = NULL; + + if (this == NULL) { + goto out; + } + + priv = this->private; + if (priv == NULL) { + goto out; + } + + qr_inode_table_destroy (priv); + qr_conf_destroy (&priv->conf); + + this->private = NULL; + + GF_FREE (priv); +out: + return; +} + +struct xlator_fops fops = { + .lookup = qr_lookup, + .readdirp = qr_readdirp, + .open = qr_open, + .readv = qr_readv, + .writev = qr_writev, + .truncate = qr_truncate, + .ftruncate = qr_ftruncate +}; + +struct xlator_cbks cbks = { + .forget = qr_forget, +}; + +struct xlator_dumpops dumpops = { + .priv = qr_priv_dump, + .inodectx = qr_inodectx_dump, +}; + +struct volume_options options[] = { + { .key = {"priority"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 32 * GF_UNIT_GB, + .default_value = "128MB", + .description = "Size of the read cache." + }, + { .key = {"cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + }, + { .key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 1 * GF_UNIT_KB * 1000, + .default_value = "64KB", + }, + { .key = {NULL} } +}; diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h new file mode 100644 index 000000000..6f0a05417 --- /dev/null +++ b/xlators/performance/quick-read/src/quick-read.h @@ -0,0 +1,81 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __QUICK_READ_H +#define __QUICK_READ_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "list.h" +#include "compat.h" +#include "compat-errno.h" +#include "common-utils.h" +#include "call-stub.h" +#include "defaults.h" +#include <libgen.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fnmatch.h> +#include "quick-read-mem-types.h" + + +struct qr_inode { + void *data; + size_t size; + int priority; + uint32_t ia_mtime; + uint32_t ia_mtime_nsec; + struct iatt buf; + struct timeval last_refresh; + struct list_head lru; +}; +typedef struct qr_inode qr_inode_t; + + +struct qr_priority { + char *pattern; + int32_t priority; + struct list_head list; +}; +typedef struct qr_priority qr_priority_t; + +struct qr_conf { + uint64_t max_file_size; + int32_t cache_timeout; + uint64_t cache_size; + int max_pri; + struct list_head priority_list; +}; +typedef struct qr_conf qr_conf_t; + +struct qr_inode_table { + uint64_t cache_used; + struct list_head *lru; + gf_lock_t lock; +}; +typedef struct qr_inode_table qr_inode_table_t; + +struct qr_private { + qr_conf_t conf; + qr_inode_table_t table; +}; +typedef struct qr_private qr_private_t; + + +#endif /* #ifndef __QUICK_READ_H */ diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am index 7bb902282..be80ae7ac 100644 --- a/xlators/performance/read-ahead/src/Makefile.am +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -1,14 +1,15 @@ xlator_LTLIBRARIES = read-ahead.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -read_ahead_la_LDFLAGS = -module -avoidversion +read_ahead_la_LDFLAGS = -module -avoid-version read_ahead_la_SOURCES = read-ahead.c page.c read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = read-ahead.h +noinst_HEADERS = read-ahead.h read-ahead-mem-types.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c index f9117dca5..e79e7ae78 100644 --- a/xlators/performance/read-ahead/src/page.c +++ b/xlators/performance/read-ahead/src/page.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. 
If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -32,317 +23,431 @@ ra_page_t * ra_page_get (ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; + ra_page_t *page = NULL; + off_t rounded_offset = 0; - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + GF_VALIDATE_OR_GOTO ("read-ahead", file, out); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); - if (page == &file->pages || page->offset != rounded_offset) - page = NULL; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - return page; + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; + +out: + return page; } ra_page_t * ra_page_create (ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; - ra_page_t *newpage = NULL; + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", file, out); - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - if (page == &file->pages || page->offset != rounded_offset) { - newpage = CALLOC (1, sizeof (*newpage)); - if (!newpage) - return NULL; + if (page == &file->pages || page->offset != rounded_offset) { + newpage = GF_CALLOC (1, sizeof (*newpage), gf_ra_mt_ra_page_t); + if (!newpage) { + goto out; + } - newpage->offset = rounded_offset; - newpage->prev = page->prev; - newpage->next = page; - newpage->file = file; - page->prev->next = newpage; - page->prev = newpage; + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; - page = newpage; - } + page = newpage; + } - return page; +out: + return page; } void ra_wait_on_page (ra_page_t *page, call_frame_t *frame) { - ra_waitq_t *waitq = NULL; - ra_local_t *local = NULL; - - local = frame->local; - waitq = CALLOC (1, sizeof (*waitq)); - if (!waitq) { - gf_log (frame->this->name, GF_LOG_ERROR, - "out of memory :("); - return; - } - - waitq->data = frame; - waitq->next = page->waitq; - page->waitq = waitq; - - ra_local_lock (local); - { - local->wait_count++; - } - ra_local_unlock (local); + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO (frame->this->name, page, out); + + local = frame->local; + + waitq = GF_CALLOC (1, sizeof (*waitq), gf_ra_mt_ra_waitq_t); + if (!waitq) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; + + ra_local_lock (local); + { + local->wait_count++; + } + ra_local_unlock (local); + +out: + return; } void ra_waitq_return (ra_waitq_t *waitq) { - ra_waitq_t *trav = NULL; - ra_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + 
call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ra_frame_return (frame); - free (trav); - } + frame = trav->data; + ra_frame_return (frame); + GF_FREE (trav); + } + + return; } int ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - ra_local_t *local = NULL; - off_t pending_offset = 0; - ra_file_t *file = NULL; - ra_page_t *page = NULL; - off_t trav_offset = 0; - size_t payload_size = 0; - ra_waitq_t *waitq = NULL; - fd_t *fd = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - local = frame->local; - fd = local->fd; - - ret = fd_ctx_get (fd, this, &tmp_file); - - file = (ra_file_t *)(long)tmp_file; - pending_offset = local->pending_offset; - trav_offset = pending_offset; - payload_size = op_ret; - - ra_file_lock (file); - { - if (op_ret >= 0) - file->stbuf = *stbuf; - - if (op_ret < 0) { - page = ra_page_get (file, pending_offset); - if (page) - waitq = ra_page_error (page, op_ret, op_errno); - goto unlock; - } - - page = ra_page_get (file, pending_offset); - if (!page) { - gf_log (this->name, GF_LOG_DEBUG, - "wasted copy: %"PRId64"[+%"PRId64"] file=%p", - pending_offset, file->page_size, file); - goto unlock; - } - - if (page->vector) { - iobref_unref (page->iobref); - free (page->vector); - } - - page->vector = iov_dup (vector, count); - page->count = count; - page->iobref = iobref_ref (iobref); - page->ready = 1; - - page->size = iov_length (vector, count); - - waitq = ra_page_wakeup (page); - } + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + fd_t *fd = NULL; + uint64_t tmp_file = 0; + + GF_ASSERT (frame); + + local = frame->local; + fd = local->fd; + + fd_ctx_get (fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + + if (file == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "read-ahead context not set in fd (%p)", fd); + op_ret = -1; + op_errno = EBADF; + goto out; + } + + ra_file_lock (file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + page = ra_page_get (file, pending_offset); + + if (!page) { + gf_log (this->name, GF_LOG_TRACE, + "wasted copy: %"PRId64"[+%"PRId64"] file=%p", + pending_offset, file->page_size, file); + goto unlock; + } + + /* + * "Dirty" means that the request was a pure read-ahead; it's + * set for requests we issue ourselves, and cleared when user + * requests are issued or put on the waitq. "Poisoned" means + * that we got a write while a read was still in flight, and we + * couldn't stop it so we marked it instead. If it's both + * dirty and poisoned by the time we get here, we cancel its + * effect so that a subsequent user read doesn't get data that + * we know is stale (because we made it stale ourselves). We + * can't use ESTALE because that has special significance. + * ECANCELED has no such special meaning, and is close to what + * we're trying to indicate. 
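Illustrative aside (not part of the patch): the dirty/poisoned rule spelled out in this comment reduces to a small predicate; the helper below is invented purely to restate it.

#include <assert.h>

/* a page filled by our own speculative read-ahead (dirty) that saw a
   write while that read was in flight (poisoned) must not satisfy a
   later user read; the fault callback reports ECANCELED instead */
static int
ra_page_result_usable (int dirty, int poisoned)
{
        return !(dirty && poisoned);
}

int
main (void)
{
        assert (!ra_page_result_usable (1, 1)); /* dirty + poisoned: drop */
        assert (ra_page_result_usable (1, 0));  /* plain read-ahead: keep  */
        return 0;
}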
+ */ + if (page->dirty && page->poisoned) { + op_ret = -1; + op_errno = ECANCELED; + } + + if (op_ret < 0) { + waitq = ra_page_error (page, op_ret, op_errno); + goto unlock; + } + + if (page->vector) { + iobref_unref (page->iobref); + GF_FREE (page->vector); + } + + page->vector = iov_dup (vector, count); + if (page->vector == NULL) { + waitq = ra_page_error (page, -1, ENOMEM); + goto unlock; + } + + page->count = count; + page->iobref = iobref_ref (iobref); + page->ready = 1; + + page->size = iov_length (vector, count); + + waitq = ra_page_wakeup (page); + } unlock: - ra_file_unlock (file); + ra_file_unlock (file); - ra_waitq_return (waitq); + ra_waitq_return (waitq); - fd_unref (local->fd); + fd_unref (local->fd); - free (frame->local); - frame->local = NULL; + mem_put (frame->local); + frame->local = NULL; - STACK_DESTROY (frame->root); - return 0; +out: + STACK_DESTROY (frame->root); + return 0; } void ra_page_fault (ra_file_t *file, call_frame_t *frame, off_t offset) { - call_frame_t *fault_frame = NULL; - ra_local_t *fault_local = NULL; - - fault_frame = copy_frame (frame); - fault_local = CALLOC (1, sizeof (ra_local_t)); - - fault_frame->local = fault_local; - fault_local->pending_offset = offset; - fault_local->pending_size = file->page_size; - - fault_local->fd = fd_ref (file->fd); - - STACK_WIND (fault_frame, ra_fault_cbk, - FIRST_CHILD (fault_frame->this), - FIRST_CHILD (fault_frame->this)->fops->readv, - file->fd, file->page_size, offset); - return; + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + int32_t op_ret = -1, op_errno = -1; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO (frame->this->name, file, out); + + fault_frame = copy_frame (frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_local = mem_get0 (THIS->local_pool); + if (fault_local == NULL) { + STACK_DESTROY (fault_frame->root); + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref (file->fd); + + STACK_WIND (fault_frame, ra_fault_cbk, + FIRST_CHILD (fault_frame->this), + FIRST_CHILD (fault_frame->this)->fops->readv, + file->fd, file->page_size, offset, 0, NULL); + + return; + +err: + ra_file_lock (file); + { + page = ra_page_get (file, offset); + if (page) + waitq = ra_page_error (page, op_ret, + op_errno); + } + ra_file_unlock (file); + + if (waitq != NULL) { + ra_waitq_return (waitq); + } + +out: + return; } + void ra_frame_fill (ra_page_t *page, call_frame_t *frame) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ra_fill_t *new = NULL; - - local = frame->local; - fill = &local->fill; - - if (local->op_ret != -1 && page->size) { - if (local->offset > page->offset) - src_offset = local->offset - page->offset; - else - dst_offset = page->offset - local->offset; - - copy_size = min (page->size - src_offset, - local->size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } - - fill = fill->next; - while (fill != &local->fill) { - if (fill->offset > page->offset) { - break; - } - fill = fill->next; - } - - new = CALLOC (1, sizeof (*new)); - - new->offset = page->offset; - new->size = copy_size; - new->iobref = 
iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - NULL); - new->vector = CALLOC (new->count, sizeof (struct iovec)); - - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - new->vector); - - new->next = fill; - new->prev = new->next->prev; - new->next->prev = new; - new->prev->next = new; - - local->op_ret += copy_size; - } + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO (frame->this->name, page, out); + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min (page->size - src_offset, + local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } + + fill = fill->next; + while (fill != &local->fill) { + if (fill->offset > page->offset) { + break; + } + fill = fill->next; + } + + new = GF_CALLOC (1, sizeof (*new), gf_ra_mt_ra_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref (page->iobref); + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + NULL); + new->vector = GF_CALLOC (new->count, sizeof (struct iovec), + gf_ra_mt_iovec); + if (new->vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + GF_FREE (new); + goto out; + } + + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + new->vector); + + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; + + local->op_ret += copy_size; + } + +out: + return; } void ra_frame_unwind (call_frame_t *frame) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - int32_t count = 0; - struct iovec *vector; - int32_t copied = 0; - struct iobref *iobref = NULL; - ra_fill_t *next = NULL; - fd_t *fd = NULL; - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - local = frame->local; - fill = local->fill.next; - - iobref = iobref_new (); - - frame->local = NULL; - - while (fill != &local->fill) { - count += fill->count; - fill = fill->next; - } - - vector = CALLOC (count, sizeof (*vector)); - - fill = local->fill.next; - - while (fill != &local->fill) { - next = fill->next; - - memcpy (((char *)vector) + copied, fill->vector, - fill->count * sizeof (*vector)); - - copied += (fill->count * sizeof (*vector)); - iobref_merge (iobref, fill->iobref); - - fill->next->prev = fill->prev; - fill->prev->next = fill->prev; - - iobref_unref (fill->iobref); - free (fill->vector); - free (fill); - - fill = next; - } - - fd = local->fd; - ret = fd_ctx_get (fd, frame->this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - vector, count, &file->stbuf, iobref); - - iobref_unref (iobref); - pthread_mutex_destroy (&local->local_lock); - free (local); - free (vector); - - return; + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + ra_fill_t *next 
= NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + + local = frame->local; + fill = local->fill.next; + + iobref = iobref_new (); + if (iobref == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + vector = GF_CALLOC (count, sizeof (*vector), gf_ra_mt_iovec); + if (vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref (iobref); + iobref = NULL; + } + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + if ((vector != NULL) && (iobref != NULL)) { + memcpy (((char *)vector) + copied, fill->vector, + fill->count * sizeof (*vector)); + + copied += (fill->count * sizeof (*vector)); + iobref_merge (iobref, fill->iobref); + } + + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; + + iobref_unref (fill->iobref); + GF_FREE (fill->vector); + GF_FREE (fill); + + fill = next; + } + + fd = local->fd; + fd_ctx_get (fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + STACK_UNWIND_STRICT (readv, frame, local->op_ret, local->op_errno, + vector, count, &file->stbuf, iobref, NULL); + + iobref_unref (iobref); + pthread_mutex_destroy (&local->local_lock); + mem_put (local); + GF_FREE (vector); + +out: + return; } /* @@ -353,25 +458,28 @@ ra_frame_unwind (call_frame_t *frame) void ra_frame_return (call_frame_t *frame) { - ra_local_t *local = NULL; - int32_t wait_count = 0; + ra_local_t *local = NULL; + int32_t wait_count = 0; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - local = frame->local; - assert (local->wait_count > 0); + local = frame->local; + GF_ASSERT (local->wait_count > 0); - ra_local_lock (local); - { - wait_count = --local->wait_count; - } - ra_local_unlock (local); + ra_local_lock (local); + { + wait_count = --local->wait_count; + } + ra_local_unlock (local); - if (!wait_count) - ra_frame_unwind (frame); + if (!wait_count) + ra_frame_unwind (frame); - return; +out: + return; } -/* +/* * ra_page_wakeup - * @page: * @@ -379,19 +487,24 @@ ra_frame_return (call_frame_t *frame) ra_waitq_t * ra_page_wakeup (ra_page_t *page) { - ra_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame; + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - trav = waitq; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ra_frame_fill (page, frame); - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill (page, frame); + } - return waitq; + if (page->stale) { + ra_page_purge (page); + } +out: + return waitq; } /* @@ -402,14 +515,20 @@ ra_page_wakeup (ra_page_t *page) void ra_page_purge (ra_page_t *page) { - page->prev->next = page->next; - page->next->prev = page->prev; - - if (page->iobref) { - iobref_unref (page->iobref); - } - free (page->vector); - free (page); + GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + + page->prev->next = page->next; + page->next->prev = page->prev; + + if (page->iobref) { + iobref_unref (page->iobref); + } + + GF_FREE (page->vector); + GF_FREE (page); + +out: + return; } /* @@ -422,32 +541,33 @@ ra_page_purge (ra_page_t *page) ra_waitq_t * ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) { + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + 
call_frame_t *frame = NULL; + ra_local_t *local = NULL; - ra_waitq_t *waitq = NULL; - ra_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - ra_local_t *local = NULL; + GF_VALIDATE_OR_GOTO ("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - trav = waitq; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - local = frame->local; - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - } + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } - ra_page_purge (page); + ra_page_purge (page); - return waitq; +out: + return waitq; } -/* +/* * ra_file_destroy - * @file: * @@ -455,24 +575,29 @@ ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) void ra_file_destroy (ra_file_t *file) { - ra_conf_t *conf = NULL; - ra_page_t *trav = NULL; - - conf = file->conf; - - ra_conf_lock (conf); - { - file->prev->next = file->next; - file->next->prev = file->prev; - } - ra_conf_unlock (conf); - - trav = file->pages.next; - while (trav != &file->pages) { - ra_page_error (trav, -1, EINVAL); - trav = file->pages.next; - } - - pthread_mutex_destroy (&file->file_lock); - free (file); + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + + conf = file->conf; + + ra_conf_lock (conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + ra_conf_unlock (conf); + + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error (trav, -1, EINVAL); + trav = file->pages.next; + } + + pthread_mutex_destroy (&file->file_lock); + GF_FREE (file); + +out: + return; } diff --git a/xlators/performance/read-ahead/src/read-ahead-mem-types.h b/xlators/performance/read-ahead/src/read-ahead-mem-types.h new file mode 100644 index 000000000..219e29289 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead-mem-types.h @@ -0,0 +1,26 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __RA_MEM_TYPES_H__ +#define __RA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_ra_mem_types_ { + gf_ra_mt_ra_file_t = gf_common_mt_end + 1, + gf_ra_mt_ra_conf_t, + gf_ra_mt_ra_page_t, + gf_ra_mt_ra_waitq_t, + gf_ra_mt_ra_fill_t, + gf_ra_mt_iovec, + gf_ra_mt_end +}; +#endif diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index 912e61dee..069ab1f1a 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -1,27 +1,18 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -/* - TODO: - - handle O_DIRECT - - maintain offset, flush on lseek - - ensure efficient memory managment in case of random seek +/* + TODO: + - handle O_DIRECT + - maintain offset, flush on lseek + - ensure efficient memory management in case of random seek */ #ifndef _CONFIG_H @@ -34,6 +25,7 @@ #include "dict.h" #include "xlator.h" #include "read-ahead.h" +#include "statedump.h" #include <assert.h> #include <sys/time.h> @@ -43,156 +35,177 @@ read_ahead (call_frame_t *frame, ra_file_t *file); int ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = CALLOC (1, sizeof (*file)); - if (!file) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto unwind; - } - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - - /* If mandatory locking has been enabled on this file, - we disable caching on it */ - - if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) - file->disabled = 1; - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - if (!file->disabled) { - file->page_count = 1; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, 
NULL); + + if (!file->disabled) { + file->page_count = 1; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_log (frame->this->name, GF_LOG_WARNING, + "cannot set read-ahead context information in fd (%p)", + fd); + ra_file_destroy (file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - STACK_UNWIND (frame, op_ret, op_errno, fd); + frame->local = NULL; + + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } int ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *buf) + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = CALLOC (1, sizeof (*file)); - if (!file) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto unwind; - } - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - - /* If mandatory locking has been enabled on this file, - we disable caching on it */ - - if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) - file->disabled = 1; - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - //file->size = fd->inode->buf.st_size; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + //file->size = fd->inode->buf.ia_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, NULL); + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "cannot set read ahead context information in fd (%p)", + fd); + ra_file_destroy (file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } int ra_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd) + fd_t *fd, dict_t *xdata) { - 
STACK_WIND (frame, ra_open_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, - loc, flags, fd); + GF_ASSERT (frame); + GF_ASSERT (this); + + STACK_WIND (frame, ra_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); - return 0; + return 0; } + int ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd) + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, ra_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd); + GF_ASSERT (frame); + GF_ASSERT (this); - return 0; + STACK_WIND (frame, ra_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + + return 0; } /* free cache pages between offset and offset+size, @@ -200,655 +213,1047 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ static void -flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size) -{ - ra_page_t *trav = NULL; - ra_page_t *next = NULL; - - ra_file_lock (file); - { - trav = file->pages.next; - while (trav != &file->pages - && trav->offset < (offset + size)) { - - next = trav->next; - if (trav->offset >= offset && !trav->waitq) { - ra_page_purge (trav); - } - trav = next; - } - } - ra_file_unlock (file); +flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, + int for_write) +{ + ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + ra_file_lock (file); + { + trav = file->pages.next; + while (trav != &file->pages + && trav->offset < (offset + size)) { + + next = trav->next; + if (trav->offset >= offset) { + if (!trav->waitq) { + ra_page_purge (trav); + } + else { + trav->stale = 1; + + if (for_write) { + trav->poisoned = 1; + } + } + } + trav = next; + } + } + ra_file_unlock (file); } int ra_release (xlator_t *this, fd_t *fd) { - uint64_t tmp_file = 0; - int ret = 0; + uint64_t tmp_file = 0; + int ret = 0; - ret = fd_ctx_del (fd, this, &tmp_file); - - if (!ret) { - ra_file_destroy ((ra_file_t *)(long)tmp_file); - } + GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + ret = fd_ctx_del (fd, this, &tmp_file); + + if (!ret) { + ra_file_destroy ((ra_file_t *)(long)tmp_file); + } - return 0; +out: + return 0; } void read_ahead (call_frame_t *frame, ra_file_t *file) { - off_t ra_offset = 0; - size_t ra_size = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - off_t cap = 0; - char fault = 0; - - if (!file->page_count) - return; - - ra_size = file->page_size * file->page_count; - ra_offset = floor (file->offset, file->page_size); - cap = file->size ? file->size : file->offset + ra_size; - - while (ra_offset < min (file->offset + ra_size, cap)) { - - ra_file_lock (file); - { - trav = ra_page_get (file, ra_offset); - } - ra_file_unlock (file); - - if (!trav) - break; - - ra_offset += file->page_size; - } - - if (trav) - /* comfortable enough */ - return; - - trav_offset = ra_offset; - - trav = file->pages.next; - cap = file->size ? 
file->size : ra_offset + ra_size; - - while (trav_offset < min(ra_offset + ra_size, cap)) { - fault = 0; - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - fault = 1; - trav = ra_page_create (file, trav_offset); - if (trav) - trav->dirty = 1; - } - } - ra_file_unlock (file); - - if (!trav) { - /* OUT OF MEMORY */ - break; - } - - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "RA at offset=%"PRId64, trav_offset); - ra_page_fault (file, frame, trav_offset); - } - trav_offset += file->page_size; - } - - return; + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO (frame->this->name, file, out); + + if (!file->page_count) { + goto out; + } + + ra_size = file->page_size * file->page_count; + ra_offset = floor (file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min (file->offset + ra_size, cap)) { + + ra_file_lock (file); + { + trav = ra_page_get (file, ra_offset); + } + ra_file_unlock (file); + + if (!trav) + break; + + ra_offset += file->page_size; + } + + if (trav) { + /* comfortable enough */ + goto out; + } + + trav_offset = ra_offset; + + cap = file->size ? file->size : ra_offset + ra_size; + + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create (file, trav_offset); + if (trav) + trav->dirty = 1; + } + } + ra_file_unlock (file); + + if (!trav) { + /* OUT OF MEMORY */ + break; + } + + if (fault) { + gf_log (frame->this->name, GF_LOG_TRACE, + "RA at offset=%"PRId64, trav_offset); + ra_page_fault (file, frame, trav_offset); + } + trav_offset += file->page_size; + } + +out: + return; } int ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - STACK_DESTROY (frame->root); - return 0; + GF_ASSERT (frame); + STACK_DESTROY (frame->root); + return 0; } static void dispatch_requests (call_frame_t *frame, ra_file_t *file) { - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - call_frame_t *ra_frame = NULL; - char need_atime_update = 1; - char fault = 0; - - local = frame->local; - conf = file->conf; - - rounded_offset = floor (local->offset, file->page_size); - rounded_end = roof (local->offset + local->size, file->page_size); - - trav_offset = rounded_offset; - trav = file->pages.next; - - while (trav_offset < rounded_end) { - fault = 0; - - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - trav = ra_page_create (file, trav_offset); - fault = 1; - need_atime_update = 0; - } - - if (!trav) - goto unlock; - - if (trav->ready) { - gf_log (frame->this->name, GF_LOG_TRACE, - "HIT at offset=%"PRId64".", - trav_offset); - ra_frame_fill (trav, frame); - } else { - gf_log (frame->this->name, GF_LOG_TRACE, - "IN-TRANSIT at offset=%"PRId64".", - trav_offset); - ra_wait_on_page (trav, frame); - need_atime_update = 0; - } - } - unlock: - ra_file_unlock (file); - - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "MISS at offset=%"PRId64".", - trav_offset); - 
ra_page_fault (file, frame, trav_offset); - } - - trav_offset += file->page_size; - } - - if (need_atime_update && conf->force_atime_update) { - /* TODO: use untimens() since readv() can confuse underlying - io-cache and others */ - ra_frame = copy_frame (frame); - STACK_WIND (ra_frame, ra_need_atime_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, 1, 1); - } - - return ; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; + + GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO (frame->this->name, file, out); + + local = frame->local; + conf = file->conf; + + rounded_offset = floor (local->offset, file->page_size); + rounded_end = roof (local->offset + local->size, file->page_size); + + trav_offset = rounded_offset; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + trav = ra_page_create (file, trav_offset); + if (!trav) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + fault = 1; + need_atime_update = 0; + } + trav->dirty = 0; + + if (trav->ready) { + gf_log (frame->this->name, GF_LOG_TRACE, + "HIT at offset=%"PRId64".", + trav_offset); + ra_frame_fill (trav, frame); + } else { + gf_log (frame->this->name, GF_LOG_TRACE, + "IN-TRANSIT at offset=%"PRId64".", + trav_offset); + ra_wait_on_page (trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock (file); + + if (local->op_ret == -1) { + goto out; + } + + if (fault) { + gf_log (frame->this->name, GF_LOG_TRACE, + "MISS at offset=%"PRId64".", + trav_offset); + ra_page_fault (file, frame, trav_offset); + } + + trav_offset += file->page_size; + } + + if (need_atime_update && conf->force_atime_update) { + /* TODO: use untimens() since readv() can confuse underlying + io-cache and others */ + ra_frame = copy_frame (frame); + if (ra_frame == NULL) { + goto out; + } + + STACK_WIND (ra_frame, ra_need_atime_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + file->fd, 1, 1, 0, NULL); + } + +out: + return ; } int ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); + GF_ASSERT (frame); - return 0; + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, + stbuf, iobref, xdata); + + return 0; } int ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, uint32_t flags, dict_t *xdata) { - ra_file_t *file = NULL; - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - int op_errno = 0; - int ret = 0; - char expected_offset = 1; - uint64_t tmp_file = 0; + ra_file_t *file = NULL; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + int op_errno = EINVAL; + char expected_offset = 1; + uint64_t tmp_file = 0; - conf = this->private; + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", - offset, size); + conf = this->private; - ret = fd_ctx_get (fd, 
this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + gf_log (this->name, GF_LOG_TRACE, + "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", + offset, size); - if (file->offset != offset) { - gf_log (this->name, GF_LOG_DEBUG, - "unexpected offset (%"PRId64" != %"PRId64") resetting", - file->offset, offset); - - expected_offset = file->expected = file->page_count = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "expected offset (%"PRId64") when page_count=%d", - offset, file->page_count); - - if (file->expected < (conf->page_size * conf->page_count)) { - file->expected += size; - file->page_count = min ((file->expected / file->page_size), - conf->page_count); - } - } + fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; - if (!expected_offset) { - flush_region (frame, file, 0, file->pages.prev->offset + 1); - } + if (!file || file->disabled) { + goto disabled; + } - if (file->disabled) { - STACK_WIND (frame, ra_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, size, offset); - return 0; - } + if (file->offset != offset) { + gf_log (this->name, GF_LOG_TRACE, + "unexpected offset (%"PRId64" != %"PRId64") resetting", + file->offset, offset); - local = (void *) CALLOC (1, sizeof (*local)); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - op_errno = ENOMEM; - goto unwind; - } + expected_offset = file->expected = file->page_count = 0; + } else { + gf_log (this->name, GF_LOG_TRACE, + "expected offset (%"PRId64") when page_count=%d", + offset, file->page_count); + + if (file->expected < (file->page_size * conf->page_count)) { + file->expected += size; + file->page_count = min ((file->expected + / file->page_size), + conf->page_count); + } + } + + if (!expected_offset) { + flush_region (frame, file, 0, file->pages.prev->offset + 1, 0); + } + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } - local->fd = fd; - local->offset = offset; - local->size = size; - local->wait_count = 1; + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; - local->fill.next = &local->fill; - local->fill.prev = &local->fill; + local->fill.next = &local->fill; + local->fill.prev = &local->fill; - pthread_mutex_init (&local->local_lock, NULL); + pthread_mutex_init (&local->local_lock, NULL); - frame->local = local; + frame->local = local; - dispatch_requests (frame, file); + dispatch_requests (frame, file); - flush_region (frame, file, 0, floor (offset, file->page_size)); + flush_region (frame, file, 0, floor (offset, file->page_size), 0); - read_ahead (frame, file); + read_ahead (frame, file); - ra_frame_return (frame); + ra_frame_return (frame); - file->offset = offset + size; + file->offset = offset + size; - return 0; + return 0; unwind: - STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); - return 0; + return 0; + +disabled: + STACK_WIND (frame, ra_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; } int ra_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno); - return 0; + GF_ASSERT (frame); + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); + return 0; } + int -ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +ra_fsync_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; + GF_ASSERT (frame); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } +int +ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + fd_ctx_get (fd, this, &tmp_file); - STACK_WIND (frame, ra_flush_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, - fd); - return 0; + file = (ra_file_t *)(long)tmp_file; + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1, 0); + } + + STACK_WIND (frame, ra_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (flush, frame, -1, op_errno, NULL); + return 0; } int -ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) +ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } + fd_ctx_get (fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1, 0); + } - STACK_WIND (frame, ra_flush_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, - fd, datasync); - return 0; + STACK_WIND (frame, ra_fsync_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, fd, datasync, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - fd_t *fd = NULL; - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; + ra_file_t *file = NULL; - fd = frame->local; + GF_ASSERT (frame); - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + file = frame->local; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1, 1); + } - frame->local = NULL; - STACK_UNWIND (frame, op_ret, op_errno, stbuf); - return 0; + frame->local = NULL; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } int ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, struct iobref *iobref) + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - ret = fd_ctx_get (fd, this, &tmp_file); - 
file = (ra_file_t *)(long)tmp_file; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1, 1); + frame->local = file; + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; + } + + STACK_WIND (frame, ra_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + + return 0; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - - /* reset the read-ahead counters too */ - file->expected = file->page_count = 0; - } +unwind: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - frame->local = fd; - STACK_WIND (frame, ra_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, iobref); +int +ra_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame); - return 0; + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; } int ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; + GF_ASSERT (frame); + + STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); + return 0; } int -ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) -{ - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - inode = loc->inode; - - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } +ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); + + inode = loc->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_ftruncate. 
+ */ + flush_region (frame, file, 0, + file->pages.prev->offset + 1, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_truncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +void +ra_page_dump (struct ra_page *page) +{ + int i = 0; + call_frame_t *frame = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + ra_waitq_t *trav = NULL; + + if (page == NULL) { + goto out; + } + + gf_proc_dump_write ("offset", "%"PRId64, page->offset); + + gf_proc_dump_write ("size", "%"PRId64, page->size); + + gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); + + gf_proc_dump_write ("poisoned", "%s", page->poisoned ? "yes" : "no"); + + gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); + + for (trav = page->waitq; trav; trav = trav->next) { + frame = trav->data; + sprintf (key, "waiting-frame[%d]", i++); + gf_proc_dump_write (key, "%"PRId64, frame->root->unique); } - UNLOCK (&inode->lock); - STACK_WIND (frame, ra_attr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - loc, offset); - return 0; +out: + return; } +int32_t +ra_fdctx_dump (xlator_t *this, fd_t *fd) +{ + ra_file_t *file = NULL; + ra_page_t *page = NULL; + int32_t ret = 0, i = 0; + uint64_t tmp_file = 0; + char *path = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + + fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file == NULL) { + ret = 0; + goto out; + } + + gf_proc_dump_build_key (key_prefix, + "xlator.performance.read-ahead", + "file"); + + gf_proc_dump_add_section (key_prefix); + + ret = __inode_path (fd->inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write ("path", "%s", path); + GF_FREE (path); + } + + gf_proc_dump_write ("fd", "%p", fd); + + gf_proc_dump_write ("disabled", "%s", file->disabled ? 
"yes" : "no"); + + if (file->disabled) { + ret = 0; + goto out; + } + + gf_proc_dump_write ("page-size", "%"PRId64, file->page_size); + + gf_proc_dump_write ("page-count", "%u", file->page_count); + + gf_proc_dump_write ("next-expected-offset-for-sequential-reads", + "%"PRId64, file->offset); + + for (page = file->pages.next; page != &file->pages; + page = page->next) { + sprintf (key, "page[%d]", i); + gf_proc_dump_write (key, "%p", page[i++]); + ra_page_dump (page); + } + + ret = 0; +out: + return ret; +} int -ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - inode = fd->inode; - - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); +ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1, 0); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, fd, xdata); + return 0; - STACK_WIND (frame, ra_attr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, - fd); - return 0; +unwind: + STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL, NULL); + return 0; } int -ra_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid) -{ - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - inode = fd->inode; - - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); +ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_truncate. 
+ */ + flush_region (frame, file, 0, + file->pages.prev->offset + 1, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_truncate_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, fd, offset, xdata); + return 0; - STACK_WIND (frame, ra_attr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fchown, - fd, uid, gid); - return 0; +unwind: + STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +int +ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame); + + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +static int +ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int -ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) -{ - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - inode = fd->inode; - - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); +ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame); - STACK_WIND (frame, ra_attr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, - fd, offset); - return 0; + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; } +static int +ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_zerofill_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->zerofill, fd, + offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +ra_priv_dump (xlator_t *this) +{ + 
ra_conf_t *conf = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + gf_boolean_t add_section = _gf_false; + + if (!this) { + goto out; + } + + conf = this->private; + if (!conf) { + gf_log (this->name, GF_LOG_WARNING, "conf null in xlator"); + goto out; + } + + gf_proc_dump_build_key (key_prefix, "xlator.performance.read-ahead", + "priv"); + + gf_proc_dump_add_section (key_prefix); + add_section = _gf_true; + + ret = pthread_mutex_trylock (&conf->conf_lock); + if (ret) + goto out; + { + gf_proc_dump_write ("page_size", "%d", conf->page_size); + gf_proc_dump_write ("page_count", "%d", conf->page_count); + gf_proc_dump_write ("force_atime_update", "%d", + conf->force_atime_update); + } + pthread_mutex_unlock (&conf->conf_lock); + + ret = 0; +out: + if (ret && conf) { + if (add_section == _gf_false) + gf_proc_dump_add_section (key_prefix); + + gf_proc_dump_write ("Unable to dump priv", + "(Lock acquisition failed) %s", this->name); + } + return ret; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) { + goto out; + } + + ret = xlator_mem_acct_init (this, gf_ra_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + } + +out: + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + ra_conf_t *conf = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + GF_VALIDATE_OR_GOTO ("read-ahead", this->private, out); + + conf = this->private; + + GF_OPTION_RECONF ("page-count", conf->page_count, options, uint32, out); + + GF_OPTION_RECONF ("page-size", conf->page_size, options, size, out); + + ret = 0; + out: + return ret; +} int init (xlator_t *this) { - ra_conf_t *conf; - dict_t *options = this->options; - char *page_count_string = NULL; + ra_conf_t *conf = NULL; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("read-ahead", this, out); - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: read-ahead not configured with exactly one" + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: read-ahead not configured with exactly one" " child"); - return -1; - } + goto out; + } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - conf = (void *) CALLOC (1, sizeof (*conf)); - ERR_ABORT (conf); - conf->page_size = this->ctx->page_size; - conf->page_count = 4; - - if (dict_get (options, "page-count")) - page_count_string = data_to_str (dict_get (options, - "page-count")); - if (page_count_string) - { - if (gf_string2uint_base10 (page_count_string, &conf->page_count) - != 0) - { - gf_log ("read-ahead", - GF_LOG_ERROR, - "invalid number format \"%s\" of \"option " - "page-count\"", - page_count_string); - return -1; - } - gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u", - conf->page_count); - } - - if (dict_get (options, "force-atime-update")) { - char *force_atime_update_str = data_to_str (dict_get (options, - "force-atime-update")); - if (gf_string2boolean (force_atime_update_str, - &conf->force_atime_update) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'force-atime-update' takes only boolean " - "options"); - return -1; - } - if (conf->force_atime_update) - gf_log (this->name, GF_LOG_DEBUG, "Forcing atime " - "updates on cache hit"); - } + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + conf = (void *) GF_CALLOC (1, sizeof (*conf), gf_ra_mt_ra_conf_t); + if (conf == NULL) { + goto out; + } + + conf->page_size = this->ctx->page_size; + + GF_OPTION_INIT ("page-size", conf->page_size, size, out); + + GF_OPTION_INIT ("page-count", conf->page_count, uint32, out); + + GF_OPTION_INIT ("force-atime-update", conf->force_atime_update, bool, out); + + conf->files.next = &conf->files; + conf->files.prev = &conf->files; - conf->files.next = &conf->files; - conf->files.prev = &conf->files; + pthread_mutex_init (&conf->conf_lock, NULL); - pthread_mutex_init (&conf->conf_lock, NULL); - this->private = conf; - return 0; + this->local_pool = mem_pool_new (ra_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = conf; + ret = 0; + +out: + if (ret == -1) { + GF_FREE (conf); + } + + return ret; } + void fini (xlator_t *this) { - ra_conf_t *conf = this->private; + ra_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + + conf = this->private; + if (conf == NULL) { + goto out; + } - pthread_mutex_destroy (&conf->conf_lock); - FREE (conf); + this->private = NULL; - this->private = NULL; - return; + GF_ASSERT ((conf->files.next == &conf->files) + && (conf->files.prev == &conf->files)); + + pthread_mutex_destroy (&conf->conf_lock); + GF_FREE (conf); + +out: + return; } struct xlator_fops fops = { - .open = ra_open, - .create = ra_create, - .readv = ra_readv, - .writev = ra_writev, - .flush = ra_flush, - .fsync = ra_fsync, - .truncate = ra_truncate, - .ftruncate = ra_ftruncate, - .fstat = ra_fstat, - .fchown = ra_fchown, + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + .writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .discard = ra_discard, + .zerofill = ra_zerofill, }; -struct xlator_mops mops = { +struct xlator_cbks cbks = { + .release = ra_release, }; -struct xlator_cbks cbks = { - .release = ra_release, +struct xlator_dumpops dumpops = { + .priv = ra_priv_dump, + .fdctx = ra_fdctx_dump, }; struct volume_options options[] = { - { .key = {"force-atime-update"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"page-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 16 + { .key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false" + }, + { .key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16, + .default_value = "4", + .description = "Number of pages that will be pre-fetched" + }, + { .key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 1048576 * 64, + .default_value = "131072", + .description = "Page size with which read-ahead performs server I/O" }, - { .key = {NULL} }, + { .key = {NULL} }, }; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h index 5513b2690..d1d768c34 100644 --- a/xlators/performance/read-ahead/src/read-ahead.h +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __READ_AHEAD_H @@ -31,6 +22,7 @@ #include "dict.h" #include "xlator.h" #include "common-utils.h" +#include "read-ahead-mem-types.h" struct ra_conf; struct ra_local; @@ -40,77 +32,79 @@ struct ra_waitq; struct ra_waitq { - struct ra_waitq *next; - void *data; + struct ra_waitq *next; + void *data; }; struct ra_fill { - struct ra_fill *next; - struct ra_fill *prev; - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; struct iobref *iobref; }; struct ra_local { - mode_t mode; - struct ra_fill fill; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - off_t pending_offset; - size_t pending_size; - fd_t *fd; - int32_t wait_count; - pthread_mutex_t local_lock; + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; }; struct ra_page { - struct ra_page *next; - struct ra_page *prev; - struct ra_file *file; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ra_waitq *waitq; + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; /* Internal request, not from user. */ + char poisoned; /* Pending read invalidated by write. 
*/ + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; struct iobref *iobref; + char stale; }; struct ra_file { - struct ra_file *next; - struct ra_file *prev; - struct ra_conf *conf; - fd_t *fd; - int disabled; - size_t expected; - struct ra_page pages; - off_t offset; - size_t size; - int32_t refcount; - pthread_mutex_t file_lock; - struct stat stbuf; - uint64_t page_size; - uint32_t page_count; + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct iatt stbuf; + uint64_t page_size; + uint32_t page_count; }; struct ra_conf { - uint64_t page_size; - uint32_t page_count; - void *cache_block; - struct ra_file files; - gf_boolean_t force_atime_update; - pthread_mutex_t conf_lock; + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; }; @@ -123,17 +117,20 @@ typedef struct ra_fill ra_fill_t; ra_page_t * ra_page_get (ra_file_t *file, - off_t offset); + off_t offset); + ra_page_t * ra_page_create (ra_file_t *file, - off_t offset); + off_t offset); + void ra_page_fault (ra_file_t *file, - call_frame_t *frame, - off_t offset); + call_frame_t *frame, + off_t offset); void ra_wait_on_page (ra_page_t *page, - call_frame_t *frame); + call_frame_t *frame); + ra_waitq_t * ra_page_wakeup (ra_page_t *page); @@ -142,16 +139,17 @@ ra_page_flush (ra_page_t *page); ra_waitq_t * ra_page_error (ra_page_t *page, - int32_t op_ret, - int32_t op_errno); + int32_t op_ret, + int32_t op_errno); void ra_page_purge (ra_page_t *page); void ra_frame_return (call_frame_t *frame); + void ra_frame_fill (ra_page_t *page, - call_frame_t *frame); + call_frame_t *frame); void ra_file_destroy (ra_file_t *file); @@ -159,36 +157,36 @@ ra_file_destroy (ra_file_t *file); static inline void ra_file_lock (ra_file_t *file) { - pthread_mutex_lock (&file->file_lock); + pthread_mutex_lock (&file->file_lock); } static inline void ra_file_unlock (ra_file_t *file) { - pthread_mutex_unlock (&file->file_lock); + pthread_mutex_unlock (&file->file_lock); } static inline void ra_conf_lock (ra_conf_t *conf) { - pthread_mutex_lock (&conf->conf_lock); + pthread_mutex_lock (&conf->conf_lock); } static inline void ra_conf_unlock (ra_conf_t *conf) { - pthread_mutex_unlock (&conf->conf_lock); + pthread_mutex_unlock (&conf->conf_lock); } static inline void ra_local_lock (ra_local_t *local) { - pthread_mutex_lock (&local->local_lock); + pthread_mutex_lock (&local->local_lock); } static inline void ra_local_unlock (ra_local_t *local) { - pthread_mutex_unlock (&local->local_lock); + pthread_mutex_unlock (&local->local_lock); } #endif /* __READ_AHEAD_H */ diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/performance/readdir-ahead/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am new file mode 100644 index 000000000..cdabd1428 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = readdir-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +readdir_ahead_la_LDFLAGS = -module -avoidversion + 
+readdir_ahead_la_SOURCES = readdir-ahead.c +readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h new file mode 100644 index 000000000..39e2c5369 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __RDA_MEM_TYPES_H__ +#define __RDA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_rda_mem_types_ { + gf_rda_mt_rda_local = gf_common_mt_end + 1, + gf_rda_mt_rda_fd_ctx, + gf_rda_mt_rda_priv, + gf_rda_mt_end +}; + +#endif diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c new file mode 100644 index 000000000..53e6756f0 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c @@ -0,0 +1,560 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* + * performance/readdir-ahead preloads a local buffer with directory entries + * on opendir. The optimization involves using maximum sized gluster rpc + * requests (128k) to minimize overhead of smaller client requests. + * + * For example, fuse currently supports a maximum readdir buffer of 4k + * (regardless of the filesystem client's buffer size). readdir-ahead should + * effectively convert these smaller requests into fewer, larger sized requests + * for simple, sequential workloads (i.e., ls). + * + * The translator is currently designed to handle the simple, sequential case + * only. If a non-sequential directory read occurs, readdir-ahead disables + * preloads on the directory. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "call-stub.h" +#include "readdir-ahead.h" +#include "readdir-ahead-mem-types.h" +#include "defaults.h" + +static int rda_fill_fd(call_frame_t *, xlator_t *, fd_t *); + +/* + * Get (or create) the fd context for storing prepopulated directory + * entries. 
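 * The context is created lazily on first access (from the opendir
 * preload or the first readdirp on the fd); thereafter the
 * application-facing readdirp path and the background filler share it
 * and serialize on ctx->lock.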
+ */ +static struct +rda_fd_ctx *get_rda_fd_ctx(fd_t *fd, xlator_t *this) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + LOCK(&fd->lock); + + if (__fd_ctx_get(fd, this, &val) < 0) { + ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx), + gf_rda_mt_rda_fd_ctx); + if (!ctx) + goto out; + + LOCK_INIT(&ctx->lock); + INIT_LIST_HEAD(&ctx->entries.list); + ctx->state = RDA_FD_NEW; + /* ctx offset values initialized to 0 */ + + if (__fd_ctx_set(fd, this, (uint64_t) ctx) < 0) { + GF_FREE(ctx); + ctx = NULL; + goto out; + } + } else { + ctx = (struct rda_fd_ctx *) val; + } +out: + UNLOCK(&fd->lock); + return ctx; +} + +/* + * Reset the tracking state of the context. + */ +static void +rda_reset_ctx(struct rda_fd_ctx *ctx) +{ + ctx->state = RDA_FD_NEW; + ctx->cur_offset = 0; + ctx->cur_size = 0; + ctx->next_offset = 0; + gf_dirent_free(&ctx->entries); +} + +/* + * Check whether we can handle a request. Offset verification is done by the + * caller, so we only check whether the preload buffer has completion status + * (including an error) or has some data to return. + */ +static gf_boolean_t +rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size) +{ + if ((ctx->state & RDA_FD_EOD) || + (ctx->state & RDA_FD_ERROR) || + (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0))) + return _gf_true; + + return _gf_false; +} + +/* + * Serve a request from the fd dentry list based on the size of the request + * buffer. ctx must be locked. + */ +static int32_t +__rda_serve_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size, + struct rda_fd_ctx *ctx) +{ + gf_dirent_t *dirent, *tmp; + size_t dirent_size, size = 0; + int32_t count = 0; + struct rda_priv *priv = this->private; + + list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) { + dirent_size = gf_dirent_size(dirent->d_name); + if (size + dirent_size > request_size) + break; + + size += dirent_size; + list_del_init(&dirent->list); + ctx->cur_size -= dirent_size; + + list_add_tail(&dirent->list, &entries->list); + ctx->cur_offset = dirent->d_off; + count++; + } + + if (ctx->cur_size <= priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + + return count; +} + +static int32_t +rda_readdirp_stub(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + gf_dirent_t entries; + int32_t ret; + struct rda_fd_ctx *ctx; + int op_errno = 0; + + ctx = get_rda_fd_ctx(fd, this); + INIT_LIST_HEAD(&entries.list); + ret = __rda_serve_readdirp(this, &entries, size, ctx); + + if (!ret && (ctx->state & RDA_FD_ERROR)) { + ret = -1; + op_errno = ctx->op_errno; + ctx->state &= ~RDA_FD_ERROR; + + /* + * the preload has stopped running in the event of an error, so + * pass all future requests along + */ + ctx->state |= RDA_FD_BYPASS; + } + + STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + + return 0; +} + +static int32_t +rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + struct rda_fd_ctx *ctx; + call_stub_t *stub; + int fill = 0; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + if (ctx->state & RDA_FD_BYPASS) + goto bypass; + + LOCK(&ctx->lock); + + /* recheck now that we have the lock */ + if (ctx->state & RDA_FD_BYPASS) { + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If a new read comes in at offset 0 and the buffer has been + * completed, reset the context and kickstart the filler again. 
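	 * (A second sequential scan of the same open directory, e.g. after
	 * a rewinddir(), is the typical case that lands here.)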
+ */ + if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) { + rda_reset_ctx(ctx); + fill = 1; + } + + /* + * If a readdir occurs at an unexpected offset or we already have a + * request pending, admit defeat and just get out of the way. + */ + if (off != ctx->cur_offset || ctx->stub) { + ctx->state |= RDA_FD_BYPASS; + UNLOCK(&ctx->lock); + goto bypass; + } + + stub = fop_readdirp_stub(frame, rda_readdirp_stub, fd, size, off, xdata); + if (!stub) { + UNLOCK(&ctx->lock); + goto err; + } + + /* + * If we haven't bypassed the preload, this means we can either serve + * the request out of the preload or the request that enables us to do + * so is in flight... + */ + if (rda_can_serve_readdirp(ctx, size)) + call_resume(stub); + else + ctx->stub = stub; + + UNLOCK(&ctx->lock); + + if (fill) + rda_fill_fd(frame, this, fd); + + return 0; + +bypass: + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *dirent, *tmp; + struct rda_local *local = frame->local; + struct rda_fd_ctx *ctx = local->ctx; + struct rda_priv *priv = this->private; + int fill = 1; + + LOCK(&ctx->lock); + + /* Verify that the preload buffer is still pending on this data. */ + if (ctx->next_offset != local->offset) { + gf_log(this->name, GF_LOG_ERROR, + "Out of sequence directory preload."); + ctx->state |= (RDA_FD_BYPASS|RDA_FD_ERROR); + ctx->op_errno = EUCLEAN; + + goto out; + } + + if (entries) { + list_for_each_entry_safe(dirent, tmp, &entries->list, list) { + list_del_init(&dirent->list); + /* must preserve entry order */ + list_add_tail(&dirent->list, &ctx->entries.list); + + ctx->cur_size += gf_dirent_size(dirent->d_name); + ctx->next_offset = dirent->d_off; + } + } + + if (ctx->cur_size >= priv->rda_high_wmark) + ctx->state &= ~RDA_FD_PLUGGED; + + if (!op_ret) { + /* we've hit eod */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_EOD; + } else if (op_ret == -1) { + /* kill the preload and pend the error */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_ERROR; + ctx->op_errno = op_errno; + } + + /* + * NOTE: The strict bypass logic in readdirp() means a pending request + * is always based on ctx->cur_offset. + */ + if (ctx->stub && + rda_can_serve_readdirp(ctx, ctx->stub->args.size)) { + call_resume(ctx->stub); + ctx->stub = NULL; + } + +out: + /* + * If we have been marked for bypass and have no pending stub, clear the + * run state so we stop preloading the context with entries. + */ + if ((ctx->state & RDA_FD_BYPASS) && !ctx->stub) + ctx->state &= ~RDA_FD_RUNNING; + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 0; + STACK_DESTROY(ctx->fill_frame->root); + ctx->fill_frame = NULL; + } + + UNLOCK(&ctx->lock); + + if (fill) + rda_fill_fd(frame, this, local->fd); + + return 0; +} + +/* + * Start prepopulating the fd context with directory entries. 
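 * Each wound readdirp completes in rda_fill_fd_cbk(), which queues the
 * returned entries on the fd context and winds the next request from
 * the same fill frame, until end-of-directory, an error, or a bypass
 * with no waiting stub stops the preload.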
+ */ +static int +rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + call_frame_t *nframe = NULL; + struct rda_local *local = NULL; + struct rda_fd_ctx *ctx; + off_t offset; + struct rda_priv *priv = this->private; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + LOCK(&ctx->lock); + + if (ctx->state & RDA_FD_NEW) { + ctx->state &= ~RDA_FD_NEW; + ctx->state |= RDA_FD_RUNNING; + if (priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + } + + offset = ctx->next_offset; + + if (!ctx->fill_frame) { + nframe = copy_frame(frame); + if (!nframe) { + UNLOCK(&ctx->lock); + goto err; + } + + local = mem_get0(this->local_pool); + if (!local) { + UNLOCK(&ctx->lock); + goto err; + } + + local->ctx = ctx; + local->fd = fd; + nframe->local = local; + + ctx->fill_frame = nframe; + } else { + nframe = ctx->fill_frame; + local = nframe->local; + } + + local->offset = offset; + + UNLOCK(&ctx->lock); + + STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size, + offset, NULL); + + return 0; + +err: + if (nframe) + FRAME_DESTROY(nframe); + + return -1; +} + +static int32_t +rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + if (!op_ret) + rda_fill_fd(frame, this, fd); + + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} + +static int32_t +rda_releasedir(xlator_t *this, fd_t *fd) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + if (fd_ctx_del(fd, this, &val) < 0) + return -1; + + ctx = (struct rda_fd_ctx *) val; + if (!ctx) + return 0; + + rda_reset_ctx(ctx); + + if (ctx->fill_frame) + STACK_DESTROY(ctx->fill_frame->root); + + if (ctx->stub) + gf_log(this->name, GF_LOG_ERROR, + "released a directory with a pending stub"); + + GF_FREE(ctx); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + goto out; + + ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1); + + if (ret != 0) + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + struct rda_priv *priv = this->private; + + GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options, + uint32, err); + GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size, + err); + GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, size, + err); + + return 0; +err: + return -1; +} + +int +init(xlator_t *this) +{ + struct rda_priv *priv = NULL; + + GF_VALIDATE_OR_GOTO("readdir-ahead", this, err); + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "FATAL: readdir-ahead not configured with exactly one" + " child"); + goto err; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv); + if (!priv) + goto err; + this->private = priv; + + this->local_pool = mem_pool_new(struct rda_local, 32); + if (!this->local_pool) + goto err; + + GF_OPTION_INIT("rda-request-size", priv->rda_req_size, uint32, err); + GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size, err); + GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size, err); + + return 0; + +err: + if (this->local_pool) + mem_pool_destroy(this->local_pool); + if (priv) + GF_FREE(priv); + + return -1; +} + + +void +fini(xlator_t *this) +{ + GF_VALIDATE_OR_GOTO ("readdir-ahead", this, out); + + GF_FREE(this->private); + +out: + return; +} + +struct xlator_fops fops = { + .opendir = rda_opendir, + .readdirp = rda_readdirp, +}; + +struct xlator_cbks cbks = { + .releasedir = rda_releasedir, +}; + +struct volume_options options[] = { + { .key = {"rda-request-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 4096, + .max = 131072, + .default_value = "131072", + .description = "readdir-ahead request size", + }, + { .key = {"rda-low-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 10 * GF_UNIT_MB, + .default_value = "4096", + .description = "the value under which we plug", + }, + { .key = {"rda-high-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 100 * GF_UNIT_MB, + .default_value = "131072", + .description = "the value over which we unplug", + }, + { .key = {NULL} }, +}; + diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h new file mode 100644 index 000000000..e48786dae --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
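As a rough illustration of the intended effect: with the default 131072-byte request size and the 4 KB FUSE readdir buffer mentioned in the header comment of readdir-ahead.c, a single preload round trip can satisfy on the order of 32 application readdir() calls. A minimal volfile fragment loading the translator directly is sketched below; the child name is illustrative and the option values are simply the defaults from the table above:

    volume readdir-ahead
        type performance/readdir-ahead
        option rda-request-size 131072
        option rda-low-wmark 4096
        option rda-high-wmark 131072
        subvolumes client0
    end-volume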
+*/ + +#ifndef __READDIR_AHEAD_H +#define __READDIR_AHEAD_H + +/* state flags */ +#define RDA_FD_NEW (1 << 0) +#define RDA_FD_RUNNING (1 << 1) +#define RDA_FD_EOD (1 << 2) +#define RDA_FD_ERROR (1 << 3) +#define RDA_FD_BYPASS (1 << 4) +#define RDA_FD_PLUGGED (1 << 5) + +struct rda_fd_ctx { + off_t cur_offset; /* current head of the ctx */ + size_t cur_size; /* current size of the preload */ + off_t next_offset; /* tail of the ctx */ + uint32_t state; + gf_lock_t lock; + gf_dirent_t entries; + call_frame_t *fill_frame; + call_stub_t *stub; + int op_errno; +}; + +struct rda_local { + struct rda_fd_ctx *ctx; + fd_t *fd; + off_t offset; +}; + +struct rda_priv { + uint32_t rda_req_size; + uint64_t rda_low_wmark; + uint64_t rda_high_wmark; +}; + +#endif /* __READDIR_AHEAD_H */ diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am deleted file mode 100644 index e52f2df48..000000000 --- a/xlators/performance/stat-prefetch/src/Makefile.am +++ /dev/null @@ -1,11 +0,0 @@ -xlator_PROGRAMS = stat-prefetch.so -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance - -stat_prefetch_so_SOURCES = stat-prefetch.c -noinst_HEADERS = stat-prefetch.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles - -CLEANFILES = - diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c b/xlators/performance/stat-prefetch/src/stat-prefetch.c deleted file mode 100644 index c6bf1e684..000000000 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "stat-prefetch.h" -#include "dict.h" -#include "xlator.h" -#include <sys/time.h> - -struct sp_cache { - struct sp_cache *next; - struct sp_cache *prev; - pid_t pid; - long long tv_time; - char *dirname; - dir_entry_t entries; - int32_t count; - pthread_mutex_t lock; -}; - -static void -stat_prefetch_cache_flush (struct sp_cache *cache, int32_t force) -{ - struct sp_cache *trav; - struct timeval tv; - long long tv_time; - - gettimeofday (&tv, NULL); - tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)); - - pthread_mutex_lock (&cache->lock); - - trav = cache->next; - while (trav != cache) { - struct sp_cache *next = trav->next; - { - if (tv_time > trav->tv_time || force) { - gf_log ("stat-prefetch", - GF_LOG_DEBUG, - "flush on: %s", - trav->dirname); - dir_entry_t *entries; - - trav->prev->next = trav->next; - trav->next->prev = trav->prev; - - entries = trav->entries.next; - - while (entries) { - dir_entry_t *nextentry = entries->next; - { - free (entries->name); - free (entries); - } - entries = nextentry; - } - free (trav->dirname); - free (trav); - } - } - trav = next; - } - - pthread_mutex_unlock (&cache->lock); -} - -static int32_t -stat_prefetch_cache_fill (struct sp_cache *cache, - pid_t pid, - char *dirname, - dir_entry_t *entries) -{ - struct sp_cache *trav; - struct timeval tv; - - pthread_mutex_unlock (&cache->lock); - trav = cache->next; - while (trav != cache) { - // if (trav->pid == pid && !strcmp (trav->dirname, dirname)) { - if (!strcmp (trav->dirname, dirname)) { - break; - } - trav = trav->next; - } - - if (trav == cache) { - trav = CALLOC (1, sizeof (*trav)); - ERR_ABORT (trav); - trav->pid = pid; - trav->dirname = dirname; - - trav->prev = cache->prev; - trav->next = cache; - trav->next->prev = trav; - trav->prev->next = trav; - } else { - free (dirname); - } - - while (trav->entries.next) { - dir_entry_t *tmp = trav->entries.next; - - trav->entries.next = trav->entries.next->next; - free (tmp->name); - free (tmp); - } - trav->entries.next = entries->next; - entries->next = NULL; - - gettimeofday (&tv, NULL); - trav->tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)) + cache->tv_time; - - pthread_mutex_unlock (&cache->lock); - return 0; -} - -static int32_t -stat_prefetch_cache_lookup (struct sp_cache *cache, - pid_t pid, - const char *path, - struct stat *buf) -{ - struct sp_cache *trav; - char *dirname = strdup (path); - char *filename = strrchr (dirname, '/'); - dir_entry_t *entries; - dir_entry_t *prev = NULL; - - *filename = '\0'; - filename ++; - - pthread_mutex_lock (&cache->lock); - trav = cache->next; - while (trav != cache) { - // if ((trav->pid == pid) && !strcmp (dirname, trav->dirname)) - if (!strcmp (dirname, trav->dirname)) - break; - trav = trav->next; - } - if (trav == cache) { - free (dirname); - pthread_mutex_unlock (&cache->lock); - return -1; - } - - entries = trav->entries.next; - prev = &trav->entries; - while (entries) { - if (!strcmp (entries->name, filename)) - break; - prev = entries; - entries = entries->next; - } - if (!entries) { - free (dirname); - pthread_mutex_unlock (&cache->lock); - return -1; - } - - *buf = entries->buf; - prev->next = entries->next; - free (entries->name); - free (entries); - free (dirname); - - pthread_mutex_unlock (&cache->lock); - - return 0; -} - - -int32_t -stat_prefetch_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entries, - int32_t 
count) -{ - char *path = frame->local; - pid_t pid = frame->root->pid; - frame->local = NULL; - - STACK_UNWIND (frame, op_ret, op_errno, entries, count); - - if (op_ret == 0) - stat_prefetch_cache_fill (this->private, - pid, - path, - entries); - else - free (path); - - return 0; -} - -int32_t -stat_prefetch_readdir (call_frame_t *frame, - xlator_t *this, - const char *path) -{ - stat_prefetch_cache_flush (this->private, 0); - - frame->local = strdup (path); - STACK_WIND (frame, - stat_prefetch_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - path); - return 0; -} - - -int32_t -stat_prefetch_getattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_getattr (call_frame_t *frame, - struct xlator *this, - const char *path) -{ - struct stat buf; - pid_t pid = frame->root->pid; - stat_prefetch_cache_flush (this->private, 0); - - if (stat_prefetch_cache_lookup (this->private, - pid, - path, - &buf) == 0) { - STACK_UNWIND (frame, 0, 0, &buf); - return 0; - } - - STACK_WIND (frame, - stat_prefetch_getattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getattr, - path); - - return 0; -} - - -int32_t -stat_prefetch_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -stat_prefetch_unlink (call_frame_t *frame, - struct xlator *this, - const char *path) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - path); - - return 0; -} - - -int32_t -stat_prefetch_chmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_chmod (call_frame_t *frame, - struct xlator *this, - const char *path, - mode_t mode) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_chmod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->chmod, - path, - mode); - - return 0; -} - - -int32_t -stat_prefetch_chown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_chown (call_frame_t *frame, - struct xlator *this, - const char *path, - uid_t uid, - gid_t gid) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_chown_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->chown, - path, - uid, - gid); - - return 0; -} - - -int32_t -stat_prefetch_utimes_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_utimes (call_frame_t *frame, - struct xlator *this, - const char *path, - struct timespec *tvp) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_utimes_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->utimes, - path, - tvp); - - return 0; -} - - -int32_t -stat_prefetch_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - 
return 0; -} - -int32_t -stat_prefetch_truncate (call_frame_t *frame, - struct xlator *this, - const char *path, - off_t offset) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - path, - offset); - - return 0; -} - - -int32_t -stat_prefetch_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -stat_prefetch_rename (call_frame_t *frame, - struct xlator *this, - const char *oldpath, - const char *newpath) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldpath, - newpath); - - return 0; -} - -int32_t -init (struct xlator *this) -{ - struct sp_cache *cache; - dict_t *options = this->options; - - if (!this->children || this->children->next) { - gf_log ("stat-prefetch", - GF_LOG_ERROR, - "FATAL: translator %s does not have exactly one child node", - this->name); - return -1; - } - - cache = (void *) CALLOC (1, sizeof (*cache)); - ERR_ABORT (cache); - cache->next = cache->prev = cache; - - cache->tv_time = 1 * 1000000; - - if (dict_get (options, "cache-seconds")) { - cache->tv_time = (data_to_int64 (dict_get (options, "cache-seconds")) * - 1000000); - } - - pthread_mutex_init (&cache->lock, NULL); - - this->private = cache; - return 0; -} - -void -fini (struct xlator *this) -{ - return; -} - - -struct xlator_fops fops = { - .getattr = stat_prefetch_getattr, - .readdir = stat_prefetch_readdir, - .unlink = stat_prefetch_unlink, - .chmod = stat_prefetch_chmod, - .chown = stat_prefetch_chown, - .rename = stat_prefetch_rename, - .utimes = stat_prefetch_utimes, - .truncate = stat_prefetch_truncate, -}; - -struct xlator_mops mops = { -}; diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h deleted file mode 100644 index ef82952b0..000000000 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _STAT_PREFETCH_H_ -#define _STAT_PREFETCH_H_ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdio.h> -#include <sys/time.h> -#include "xlator.h" - -#endif /* _STAT_PREFETCH_H_ */ diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am index 06e85fc92..4091c3293 100644 --- a/xlators/performance/symlink-cache/src/Makefile.am +++ b/xlators/performance/symlink-cache/src/Makefile.am @@ -1,12 +1,13 @@ xlator_LTLIBRARIES = symlink-cache.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/performance -symlink_cache_la_LDFLAGS = -module -avoidversion +symlink_cache_la_LDFLAGS = -module -avoid-version symlink_cache_la_SOURCES = symlink-cache.c symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/symlink-cache/src/symlink-cache.c b/xlators/performance/symlink-cache/src/symlink-cache.c index ad0836c5e..3b5fbc252 100644 --- a/xlators/performance/symlink-cache/src/symlink-cache.c +++ b/xlators/performance/symlink-cache/src/symlink-cache.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -90,7 +81,7 @@ sc_cache_update (xlator_t *this, inode_t *inode, const char *link) int -sc_cache_set (xlator_t *this, inode_t *inode, struct stat *buf, +sc_cache_set (xlator_t *this, inode_t *inode, struct iatt *buf, const char *link) { struct symlink_cache *sc = NULL; @@ -126,7 +117,7 @@ sc_cache_set (xlator_t *this, inode_t *inode, struct stat *buf, } } - sc->ctime = buf->st_ctime; + sc->ctime = buf->ia_ctime; gf_log (this->name, GF_LOG_DEBUG, "setting symlink cache: %s", link); @@ -146,8 +137,7 @@ sc_cache_set (xlator_t *this, inode_t *inode, struct stat *buf, err: if (sc) { - if (sc->readlink) - FREE (sc->readlink); + FREE (sc->readlink); sc->readlink = NULL; FREE (sc); } @@ -180,12 +170,12 @@ sc_cache_flush (xlator_t *this, inode_t *inode) int -sc_cache_validate (xlator_t *this, inode_t *inode, struct stat *buf) +sc_cache_validate (xlator_t *this, inode_t *inode, struct iatt *buf) { struct symlink_cache *sc = NULL; uint64_t tmp_sc = 0; - if (!S_ISLNK (buf->st_mode)) { + if (!IA_ISLNK (buf->ia_type)) { sc_cache_flush (this, inode); return 0; } @@ -204,7 +194,7 @@ sc_cache_validate (xlator_t *this, inode_t *inode, struct stat *buf) sc = (struct symlink_cache *)(long)tmp_sc; } - if (sc->ctime == buf->st_ctime) + if (sc->ctime == buf->ia_ctime) return 0; /* STALE */ @@ -216,7 +206,7 @@ sc_cache_validate (xlator_t *this, inode_t *inode, struct stat *buf) sc->readlink = NULL; } - sc->ctime = buf->st_ctime; + sc->ctime = buf->ia_ctime; return 0; } @@ -242,7 +232,7 @@ sc_cache_get (xlator_t *this, inode_t *inode, char **link) int sc_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - const char *link) + const char *link, struct iatt *sbuf, dict_t *xdata) { if (op_ret > 0) sc_cache_update (this, frame->local, link); @@ -250,16 +240,18 @@ sc_readlink_cbk (call_frame_t *frame, void *cookie, inode_unref (frame->local); frame->local = NULL; - STACK_UNWIND (frame, op_ret, op_errno, link); + STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, link, sbuf, + xdata); return 0; } int sc_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { char *link = NULL; + struct iatt buf = {0, }; sc_cache_get (this, loc->inode, &link); @@ -268,7 +260,14 @@ sc_readlink (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_DEBUG, "cache hit %s -> %s", loc->path, link); - STACK_UNWIND (frame, strlen (link), 0, link); + + /* + libglusterfsclient, nfs or any other translators + using buf in readlink_cbk should be aware that @buf + is 0 filled + */ + STACK_UNWIND_STRICT (readlink, frame, strlen (link), 0, link, + &buf, NULL); FREE (link); return 0; } @@ -278,7 +277,7 @@ sc_readlink (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, sc_readlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, - loc, size); + loc, size, xdata); return 0; } @@ -287,7 +286,8 @@ sc_readlink (call_frame_t *frame, xlator_t *this, int sc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *buf) + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { if (op_ret == 0) { if (frame->local) { @@ -295,21 +295,22 @@ sc_symlink_cbk (call_frame_t *frame, void *cookie, } } - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); return 0; } int sc_symlink (call_frame_t *frame, xlator_t *this, - const char 
*dst, loc_t *src) + const char *dst, loc_t *src, mode_t umask, dict_t *xdata) { frame->local = strdup (dst); STACK_WIND (frame, sc_symlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - dst, src); + dst, src, umask, xdata); return 0; } @@ -318,26 +319,28 @@ sc_symlink (call_frame_t *frame, xlator_t *this, int sc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { if (op_ret == 0) sc_cache_validate (this, inode, buf); else sc_cache_flush (this, inode); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr); + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); return 0; } int sc_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) + loc_t *loc, dict_t *xdata) { STACK_WIND (frame, sc_lookup_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, - loc, xattr_req); + loc, xdata); return 0; } @@ -353,10 +356,9 @@ sc_forget (xlator_t *this, } -int32_t +int32_t init (xlator_t *this) { - if (!this->children || this->children->next) { gf_log (this->name, GF_LOG_ERROR, @@ -387,8 +389,6 @@ struct xlator_fops fops = { .readlink = sc_readlink, }; -struct xlator_mops mops = { -}; struct xlator_cbks cbks = { .forget = sc_forget, diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am index f800abad5..6c829d8ee 100644 --- a/xlators/performance/write-behind/src/Makefile.am +++ b/xlators/performance/write-behind/src/Makefile.am @@ -1,12 +1,15 @@ xlator_LTLIBRARIES = write-behind.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -write_behind_la_LDFLAGS = -module -avoidversion +write_behind_la_LDFLAGS = -module -avoid-version write_behind_la_SOURCES = write-behind.c write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +noinst_HEADERS = write-behind-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/write-behind/src/write-behind-mem-types.h b/xlators/performance/write-behind/src/write-behind-mem-types.h new file mode 100644 index 000000000..f64f429ce --- /dev/null +++ b/xlators/performance/write-behind/src/write-behind-mem-types.h @@ -0,0 +1,26 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __WB_MEM_TYPES_H__ +#define __WB_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_wb_mem_types_ { + gf_wb_mt_wb_file_t = gf_common_mt_end + 1, + gf_wb_mt_wb_request_t, + gf_wb_mt_iovec, + gf_wb_mt_wb_conf_t, + gf_wb_mt_wb_inode_t, + gf_wb_mt_end +}; +#endif + diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 9ce8557f4..95c5921c6 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -1,24 +1,13 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. 
<http://www.zresearch.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -/*TODO: check for non null wb_file_data before getting wb_file */ - #ifndef _CONFIG_H #define _CONFIG_H @@ -34,1846 +23,1989 @@ #include "compat-errno.h" #include "common-utils.h" #include "call-stub.h" +#include "statedump.h" +#include "defaults.h" +#include "write-behind-mem-types.h" + +#define MAX_VECTOR_COUNT 8 +#define WB_AGGREGATE_SIZE 131072 /* 128 KB */ +#define WB_WINDOW_SIZE 1048576 /* 1MB */ -#define MAX_VECTOR_COUNT 8 -#define WB_AGGREGATE_SIZE 131072 /* 128 KB */ - typedef struct list_head list_head_t; struct wb_conf; -struct wb_page; -struct wb_file; - - -typedef struct wb_file { - int disabled; - uint64_t disable_till; - size_t window_size; - int32_t refcount; - int32_t op_ret; - int32_t op_errno; - list_head_t request; - fd_t *fd; +struct wb_inode; + +typedef struct wb_inode { + ssize_t window_conf; + ssize_t window_current; + ssize_t transit; /* size of data stack_wound, and yet + to be fulfilled (wb_fulfill_cbk). + used for trickling_writes + */ + + list_head_t all; /* All requests, from enqueue() till destroy(). + Used only for resetting generation + number when empty. + */ + list_head_t todo; /* Work to do (i.e, STACK_WIND to server). + Once we STACK_WIND, the entry is taken + off the list. If it is non-sync write, + then we continue to track it via @liability + or @temptation depending on the status + of its writeback. + */ + list_head_t liability; /* Non-sync writes which are lied + (STACK_UNWIND'ed to caller) but ack + from server not yet complete. This + is the "liability" which we hold, and + must guarantee that dependent operations + which arrive later (which overlap, etc.) + are issued only after their dependencies + in this list are "fulfilled". + + Server acks for entries in this list + shrinks the window. + + The sum total of all req->write_size + of entries in this list must be kept less + than the permitted window size. + */ + list_head_t temptation; /* Operations for which we are tempted + to 'lie' (write-behind), but temporarily + holding off (because of insufficient + window capacity, etc.) + + This is the list to look at to grow + the window (in __wb_pick_unwinds()). + + Entries typically get chosen from + write-behind from this list, and therefore + get "upgraded" to the "liability" list. + */ + list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC + which are currently STACK_WIND'ed towards the server. + This is for guaranteeing that no two overlapping + writes are in progress at the same time. 
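                                 (Enforced by refusing to issue a write
                                 while an in-flight write overlapping its
                                 range sits on this list; see
                                 wb_wip_has_conflict() below.)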
Modules + like eager-lock in AFR depend on this behavior. + */ + uint64_t gen; /* Liability generation number. Represents + the current 'state' of liability. Every + new addition to the liability list bumps + the generation number. + + a newly arrived request is only required + to perform causal checks against the entries + in the liability list which were present + at the time of its addition. the generation + number at the time of its addition is stored + in the request and used during checks. + + the liability list can grow while the request + waits in the todo list waiting for its + dependent operations to complete. however + it is not of the request's concern to depend + itself on those new entries which arrived + after it arrived (i.e, those that have a + liability generation higher than itself) + */ gf_lock_t lock; xlator_t *this; -}wb_file_t; +} wb_inode_t; typedef struct wb_request { - list_head_t list; - list_head_t winds; - list_head_t unwinds; - list_head_t other_requests; - call_stub_t *stub; - int32_t refcount; - wb_file_t *file; - union { - struct { - char write_behind; - char stack_wound; - char got_reply; - }write_request; - - struct { - char marked_for_resume; - }other_requests; - }flags; + list_head_t all; + list_head_t todo; + list_head_t lie; /* either in @liability or @temptation */ + list_head_t winds; + list_head_t unwinds; + list_head_t wip; + + call_stub_t *stub; + + ssize_t write_size; /* currently held size + (after collapsing) */ + size_t orig_size; /* size which arrived with the request. + This is the size by which we grow + the window when unwinding the frame. + */ + size_t total_size; /* valid only in @head in wb_fulfill(). + This is the size with which we perform + STACK_WIND to server and therefore the + amount by which we shrink the window. + */ + + int op_ret; + int op_errno; + + int32_t refcount; + wb_inode_t *wb_inode; + glusterfs_fop_t fop; + gf_lkowner_t lk_owner; + struct iobref *iobref; + uint64_t gen; /* inode liability state at the time of + request arrival */ + + fd_t *fd; + struct { + size_t size; /* 0 size == till infinity */ + off_t off; + int append:1; /* offset is invalid. 
only one + outstanding append at a time */ + int tempted:1; /* true only for non-sync writes */ + int lied:1; /* sin committed */ + int fulfilled:1; /* got server acknowledgement */ + int go:1; /* enough aggregating, good to go */ + } ordering; } wb_request_t; -struct wb_conf { - uint64_t aggregate_size; - uint64_t window_size; - uint64_t disable_till; - gf_boolean_t enable_O_SYNC; - gf_boolean_t flush_behind; -}; +typedef struct wb_conf { + uint64_t aggregate_size; + uint64_t window_size; + gf_boolean_t flush_behind; + gf_boolean_t trickling_writes; + gf_boolean_t strict_write_ordering; + gf_boolean_t strict_O_DIRECT; +} wb_conf_t; -typedef struct wb_local { - list_head_t winds; - struct wb_file *file; - wb_request_t *request; - int op_ret; - int op_errno; - call_frame_t *frame; - int32_t reply_count; -} wb_local_t; +void +wb_process_queue (wb_inode_t *wb_inode); -typedef struct wb_conf wb_conf_t; -typedef struct wb_page wb_page_t; +wb_inode_t * +__wb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + wb_inode_t *wb_inode = NULL; + __inode_ctx_get (inode, this, &value); + wb_inode = (wb_inode_t *)(unsigned long) value; -int32_t -wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all); + return wb_inode; +} -size_t -wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds); -size_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size, - char wind_all); +wb_inode_t * +wb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; + GF_VALIDATE_OR_GOTO ("write-behind", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); -static void -__wb_request_unref (wb_request_t *this) -{ - if (this->refcount <= 0) { - gf_log ("wb-request", GF_LOG_DEBUG, - "refcount(%d) is <= 0", this->refcount); - return; + LOCK (&inode->lock); + { + wb_inode = __wb_inode_ctx_get (this, inode); } + UNLOCK (&inode->lock); +out: + return wb_inode; +} - this->refcount--; - if (this->refcount == 0) { - list_del_init (&this->list); - if (this->stub && this->stub->fop == GF_FOP_WRITE) { - call_stub_destroy (this->stub); + +gf_boolean_t +wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) +{ + gf_boolean_t err = _gf_false; + uint64_t value = 0; + int32_t tmp = 0; + + if (fd_ctx_get (fd, this, &value) == 0) { + if (op_errno) { + tmp = value; + *op_errno = tmp; } - FREE (this); + err = _gf_true; } + + return err; } -static void -wb_request_unref (wb_request_t *this) +/* + Below is a succinct explanation of the code deciding whether two regions + overlap, from Pavan <tcp@gluster.com>. + + For any two ranges to be non-overlapping, either the end of the first + range is lesser than the start of the second, or vice versa. Example - + + <---------> <--------------> + p q x y + + ( q < x ) or (y < p) = > No overlap. + + To check for *overlap*, we can negate this (using de morgan's laws), and + it becomes - + + (q >= x ) and (y >= p) + + Either that, or you write the negation using - + + if (! 
((q < x) or (y < p)) ) { + "Overlap" + } +*/ + +gf_boolean_t +wb_requests_overlap (wb_request_t *req1, wb_request_t *req2) { - wb_file_t *file = NULL; - if (this == NULL) { - gf_log ("wb-request", GF_LOG_DEBUG, - "request is NULL"); - return; - } - - file = this->file; - LOCK (&file->lock); - { - __wb_request_unref (this); - } - UNLOCK (&file->lock); + uint64_t r1_start = 0; + uint64_t r1_end = 0; + uint64_t r2_start = 0; + uint64_t r2_end = 0; + enum _gf_boolean do_overlap = 0; + + r1_start = req1->ordering.off; + if (req1->ordering.size) + r1_end = r1_start + req1->ordering.size - 1; + else + r1_end = ULLONG_MAX; + + r2_start = req2->ordering.off; + if (req2->ordering.size) + r2_end = r2_start + req2->ordering.size - 1; + else + r2_end = ULLONG_MAX; + + do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start)); + + return do_overlap; } -static wb_request_t * -__wb_request_ref (wb_request_t *this) +gf_boolean_t +wb_requests_conflict (wb_request_t *lie, wb_request_t *req) { - if (this->refcount < 0) { - gf_log ("wb-request", GF_LOG_DEBUG, - "refcount(%d) is < 0", this->refcount); - return NULL; - } + wb_conf_t *conf = NULL; + + conf = req->wb_inode->this->private; + + if (lie == req) + /* request cannot conflict with itself */ + return _gf_false; + + if (lie->gen >= req->gen) + /* this liability entry was behind + us in the todo list */ + return _gf_false; - this->refcount++; - return this; + if (lie->ordering.append) + /* all modifications wait for the completion + of outstanding append */ + return _gf_true; + + if (conf->strict_write_ordering) + /* We are sure (lie->gen < req->gen) by now. So + skip overlap check if strict write ordering is + requested and always return "conflict" against a + lower generation lie. */ + return _gf_true; + + return wb_requests_overlap (lie, req); } -wb_request_t * -wb_request_ref (wb_request_t *this) +gf_boolean_t +wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) { - wb_file_t *file = NULL; - if (this == NULL) { - gf_log ("wb-request", GF_LOG_DEBUG, - "request is NULL"); - return NULL; + wb_request_t *each = NULL; + + list_for_each_entry (each, &wb_inode->liability, lie) { + if (wb_requests_conflict (each, req)) + return _gf_true; } - file = this->file; - LOCK (&file->lock); - { - this = __wb_request_ref (this); + return _gf_false; +} + + +gf_boolean_t +wb_wip_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) +{ + wb_request_t *each = NULL; + + if (req->stub->fop != GF_FOP_WRITE) + /* non-writes fundamentally never conflict with WIP requests */ + return _gf_false; + + list_for_each_entry (each, &wb_inode->wip, wip) { + if (each == req) + /* request never conflicts with itself, + though this condition should never occur. 
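                           Otherwise the same closed-interval overlap
                           test applies as for liabilities: e.g. writes
                           covering [0,4095] and [4096,8191] do not
                           overlap, while [0,4095] and [4000,4099] do;
                           a request with ordering.size == 0 (such as a
                           truncate) is treated as extending to infinity
                           and overlaps anything whose range reaches its
                           offset or beyond.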
+ */ + continue; + + if (wb_requests_overlap (each, req)) + return _gf_true; } - UNLOCK (&file->lock); - return this; + return _gf_false; } -wb_request_t * -wb_enqueue (wb_file_t *file, call_stub_t *stub) +static int +__wb_request_unref (wb_request_t *req) { - wb_request_t *request = NULL; - call_frame_t *frame = NULL; - wb_local_t *local = NULL; - struct iovec *vector = NULL; - int32_t count = 0; - - request = CALLOC (1, sizeof (*request)); - - INIT_LIST_HEAD (&request->list); - INIT_LIST_HEAD (&request->winds); - INIT_LIST_HEAD (&request->unwinds); - INIT_LIST_HEAD (&request->other_requests); - - request->stub = stub; - request->file = file; - - frame = stub->frame; - local = frame->local; - if (local) { - local->request = request; + int ret = -1; + wb_inode_t *wb_inode = NULL; + + wb_inode = req->wb_inode; + + if (req->refcount <= 0) { + gf_log ("wb-request", GF_LOG_WARNING, + "refcount(%d) is <= 0", req->refcount); + goto out; } - if (stub->fop == GF_FOP_WRITE) { - vector = stub->args.writev.vector; - count = stub->args.writev.count; + ret = --req->refcount; + if (req->refcount == 0) { + list_del_init (&req->todo); + list_del_init (&req->lie); + list_del_init (&req->wip); - frame = stub->frame; - local = frame->local; - local->op_ret = iov_length (vector, count); - local->op_errno = 0; + list_del_init (&req->all); + if (list_empty (&wb_inode->all)) { + wb_inode->gen = 0; + /* in case of accounting errors? */ + wb_inode->window_current = 0; + } + + list_del_init (&req->winds); + list_del_init (&req->unwinds); + + if (req->stub && req->ordering.tempted) { + call_stub_destroy (req->stub); + req->stub = NULL; + } /* else we would have call_resume()'ed */ + + if (req->iobref) + iobref_unref (req->iobref); + + if (req->fd) + fd_unref (req->fd); + + GF_FREE (req); } +out: + return ret; +} - LOCK (&file->lock); + +static int +wb_request_unref (wb_request_t *req) +{ + wb_inode_t *wb_inode = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("write-behind", req, out); + + wb_inode = req->wb_inode; + + LOCK (&wb_inode->lock); { - list_add_tail (&request->list, &file->request); - if (stub->fop == GF_FOP_WRITE) { - /* reference for stack winding */ - __wb_request_ref (request); - - /* reference for stack unwinding */ - __wb_request_ref (request); - } else { - /*reference for resuming */ - __wb_request_ref (request); - } + ret = __wb_request_unref (req); } - UNLOCK (&file->lock); + UNLOCK (&wb_inode->lock); - return request; +out: + return ret; } -wb_file_t * -wb_file_create (xlator_t *this, fd_t *fd) +static wb_request_t * +__wb_request_ref (wb_request_t *req) { - wb_file_t *file = NULL; - wb_conf_t *conf = this->private; - - file = CALLOC (1, sizeof (*file)); - INIT_LIST_HEAD (&file->request); - - /* - fd_ref() not required, file should never decide the existance of - an fd - */ - file->fd= fd; - file->disable_till = conf->disable_till; - file->this = this; - file->refcount = 1; - - fd_ctx_set (fd, this, (uint64_t)(long)file); - - return file; + GF_VALIDATE_OR_GOTO ("write-behind", req, out); + + if (req->refcount < 0) { + gf_log ("wb-request", GF_LOG_WARNING, + "refcount(%d) is < 0", req->refcount); + req = NULL; + goto out; + } + + req->refcount++; + +out: + return req; } -void -wb_file_destroy (wb_file_t *file) + +wb_request_t * +wb_request_ref (wb_request_t *req) { - int32_t refcount = 0; + wb_inode_t *wb_inode = NULL; - LOCK (&file->lock); - { - refcount = --file->refcount; - } - UNLOCK (&file->lock); + GF_VALIDATE_OR_GOTO ("write-behind", req, out); - if (!refcount){ - LOCK_DESTROY 
(&file->lock); - FREE (file); + wb_inode = req->wb_inode; + LOCK (&wb_inode->lock); + { + req = __wb_request_ref (req); } + UNLOCK (&wb_inode->lock); - return; +out: + return req; } -int32_t -wb_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *stbuf) +gf_boolean_t +wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted) { - wb_local_t *local = NULL; - list_head_t *winds = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL, *dummy = NULL; - wb_local_t *per_request_local = NULL; + wb_request_t *req = NULL; + + GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out); + GF_VALIDATE_OR_GOTO (wb_inode->this->name, stub, out); + + req = GF_CALLOC (1, sizeof (*req), gf_wb_mt_wb_request_t); + if (!req) + goto out; + + INIT_LIST_HEAD (&req->all); + INIT_LIST_HEAD (&req->todo); + INIT_LIST_HEAD (&req->lie); + INIT_LIST_HEAD (&req->winds); + INIT_LIST_HEAD (&req->unwinds); + INIT_LIST_HEAD (&req->wip); + + req->stub = stub; + req->wb_inode = wb_inode; + req->fop = stub->fop; + req->ordering.tempted = tempted; + + if (stub->fop == GF_FOP_WRITE) { + req->write_size = iov_length (stub->args.vector, + stub->args.count); + + /* req->write_size can change as we collapse + small writes. But the window needs to grow + only by how much we acknowledge the app. so + copy the original size in orig_size for the + purpose of accounting. + */ + req->orig_size = req->write_size; + + /* Let's be optimistic that we can + lie about it + */ + req->op_ret = req->write_size; + req->op_errno = 0; + + if (stub->args.fd->flags & O_APPEND) + req->ordering.append = 1; + } + + req->lk_owner = stub->frame->root->lk_owner; + + switch (stub->fop) { + case GF_FOP_WRITE: + req->ordering.off = stub->args.offset; + req->ordering.size = req->write_size; + + req->fd = fd_ref (stub->args.fd); + + break; + case GF_FOP_READ: + req->ordering.off = stub->args.offset; + req->ordering.size = stub->args.size; + + req->fd = fd_ref (stub->args.fd); + + break; + case GF_FOP_TRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + break; + case GF_FOP_FTRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + req->fd = fd_ref (stub->args.fd); - local = frame->local; - winds = &local->winds; - file = local->file; + break; + default: + break; + } - LOCK (&file->lock); + LOCK (&wb_inode->lock); { - list_for_each_entry_safe (request, dummy, winds, winds) { - request->flags.write_request.got_reply = 1; + list_add_tail (&req->all, &wb_inode->all); - if (!request->flags.write_request.write_behind - && (op_ret == -1)) { - per_request_local = request->stub->frame->local; - per_request_local->op_ret = op_ret; - per_request_local->op_errno = op_errno; - } + req->gen = wb_inode->gen; - __wb_request_unref (request); - } - - if (op_ret == -1) { - file->op_ret = op_ret; - file->op_errno = op_errno; - } + list_add_tail (&req->todo, &wb_inode->todo); + __wb_request_ref (req); /* for wind */ + + if (req->ordering.tempted) { + list_add_tail (&req->lie, &wb_inode->temptation); + __wb_request_ref (req); /* for unwind */ + } } - UNLOCK (&file->lock); + UNLOCK (&wb_inode->lock); - wb_process_queue (frame, file, 0); - - /* safe place to do fd_unref */ - fd_unref (file->fd); +out: + if (!req) + return _gf_false; - STACK_DESTROY (frame->root); + return _gf_true; +} - return 0; + +gf_boolean_t +wb_enqueue (wb_inode_t *wb_inode, call_stub_t *stub) +{ + return wb_enqueue_common (wb_inode, stub, 0); } -size_t 
-wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds) +gf_boolean_t +wb_enqueue_tempted (wb_inode_t *wb_inode, call_stub_t *stub) { - wb_request_t *dummy = NULL, *request = NULL; - wb_request_t *first_request = NULL, *next = NULL; - size_t total_count = 0, count = 0; - size_t copied = 0; - call_frame_t *sync_frame = NULL; - struct iobref *iobref = NULL; - wb_local_t *local = NULL; - struct iovec *vector = NULL; - size_t bytes = 0, current_size = 0; - size_t bytecount = 0; - wb_conf_t *conf = NULL; - - conf = file->this->private; - list_for_each_entry (request, winds, winds) { - total_count += request->stub->args.writev.count; - bytes += iov_length (request->stub->args.writev.vector, - request->stub->args.writev.count); - } + return wb_enqueue_common (wb_inode, stub, 1); +} + + +wb_inode_t * +__wb_inode_create (xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + conf = this->private; - if (!total_count) { + wb_inode = GF_CALLOC (1, sizeof (*wb_inode), gf_wb_mt_wb_inode_t); + if (!wb_inode) goto out; - } - - list_for_each_entry_safe (request, dummy, winds, winds) { - if (!vector) { - vector = MALLOC (VECTORSIZE (MAX_VECTOR_COUNT)); - iobref = iobref_new (); - - local = CALLOC (1, sizeof (*local)); - INIT_LIST_HEAD (&local->winds); - - first_request = request; - current_size = 0; - } - count += request->stub->args.writev.count; - bytecount = VECTORSIZE (request->stub->args.writev.count); - memcpy (((char *)vector)+copied, - request->stub->args.writev.vector, - bytecount); - copied += bytecount; - - current_size += iov_length (request->stub->args.writev.vector, - request->stub->args.writev.count); - - if (request->stub->args.writev.iobref) { - iobref_merge (iobref, - request->stub->args.writev.iobref); - } + INIT_LIST_HEAD (&wb_inode->all); + INIT_LIST_HEAD (&wb_inode->todo); + INIT_LIST_HEAD (&wb_inode->liability); + INIT_LIST_HEAD (&wb_inode->temptation); + INIT_LIST_HEAD (&wb_inode->wip); - next = NULL; - if (request->winds.next != winds) { - next = list_entry (request->winds.next, - wb_request_t, winds); - } + wb_inode->this = this; - list_del_init (&request->winds); - list_add_tail (&request->winds, &local->winds); + wb_inode->window_conf = conf->window_size; - if ((!next) - || ((count + next->stub->args.writev.count) - > MAX_VECTOR_COUNT) - || ((current_size + iov_length (next->stub->args.writev.vector, - next->stub->args.writev.count)) - > conf->aggregate_size)) - { - sync_frame = copy_frame (frame); - sync_frame->local = local; - local->file = file; - fd_ref (file->fd); - STACK_WIND (sync_frame, - wb_sync_cbk, - FIRST_CHILD(sync_frame->this), - FIRST_CHILD(sync_frame->this)->fops->writev, - file->fd, vector, - count, - first_request->stub->args.writev.off, - iobref); - - iobref_unref (iobref); - FREE (vector); - first_request = NULL; - iobref = NULL; - vector = NULL; - copied = count = 0; - } - } + LOCK_INIT (&wb_inode->lock); + + __inode_ctx_put (inode, this, (uint64_t)(unsigned long)wb_inode); out: - return bytes; + return wb_inode; } -int32_t -wb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *buf) +wb_inode_t * +wb_inode_create (xlator_t *this, inode_t *inode) { - wb_local_t *local = NULL; - wb_request_t *request = NULL; - call_frame_t *process_frame = NULL; - wb_file_t *file = NULL; - - local = frame->local; - file = local->file; - - request = local->request; - if (request) { - process_frame = copy_frame (frame); 
- } + wb_inode_t *wb_inode = NULL; - STACK_UNWIND (frame, op_ret, op_errno, buf); - - if (request) { - wb_request_unref (request); - wb_process_queue (process_frame, file, 0); - STACK_DESTROY (process_frame->root); - } + GF_VALIDATE_OR_GOTO (this->name, inode, out); - if (file) { - fd_unref (file->fd); + LOCK (&inode->lock); + { + wb_inode = __wb_inode_ctx_get (this, inode); + if (!wb_inode) + wb_inode = __wb_inode_create (this, inode); } + UNLOCK (&inode->lock); - return 0; +out: + return wb_inode; } -static int32_t -wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc) +void +wb_inode_destroy (wb_inode_t *wb_inode) { - STACK_WIND (frame, wb_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc); - return 0; + GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out); + + LOCK_DESTROY (&wb_inode->lock); + GF_FREE (wb_inode); +out: + return; } -int32_t -wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +void +__wb_fulfill_request (wb_request_t *req) { - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - - if (loc->inode) { - /* FIXME: fd_lookup extends life of fd till stat returns */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)) { - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - } - } - } + wb_inode_t *wb_inode = NULL; - local = CALLOC (1, sizeof (*local)); - local->file = file; + wb_inode = req->wb_inode; - frame->local = local; + req->ordering.fulfilled = 1; + wb_inode->window_current -= req->total_size; + wb_inode->transit -= req->total_size; - if (file) { - stub = fop_stat_stub (frame, wb_stat_helper, loc); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - - wb_enqueue (file, stub); - - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, wb_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc); - } + if (!req->ordering.lied) { + /* TODO: fail the req->frame with error if + necessary + */ + } - return 0; + __wb_request_unref (req); } -int32_t -wb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *buf) +void +wb_head_done (wb_request_t *head) { - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - - local = frame->local; - file = local->file; - - request = local->request; - if (request) { - wb_request_unref (request); - wb_process_queue (frame, file, 0); - } + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_inode_t *wb_inode = NULL; + + wb_inode = head->wb_inode; + + LOCK (&wb_inode->lock); + { + list_for_each_entry_safe (req, tmp, &head->winds, winds) { + __wb_fulfill_request (req); + } + __wb_fulfill_request (head); + } + UNLOCK (&wb_inode->lock); +} - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; +void +wb_fulfill_err (wb_request_t *head, int op_errno) +{ + wb_inode_t *wb_inode; + wb_request_t *req; + + wb_inode = head->wb_inode; + + /* for all future requests yet to arrive */ + fd_ctx_set (head->fd, THIS, op_errno); + + LOCK (&wb_inode->lock); + { + /* for all requests already arrived */ + list_for_each_entry (req, &wb_inode->all, all) { + if (req->fd != head->fd) + continue; + req->op_ret = -1; + req->op_errno = op_errno; + } + } + UNLOCK (&wb_inode->lock); } -int32_t -wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) +int +wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + 
int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_WIND (frame, - wb_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd); + wb_inode_t *wb_inode = NULL; + wb_request_t *head = NULL; + + head = frame->local; + frame->local = NULL; + + wb_inode = head->wb_inode; + + if (op_ret == -1) { + wb_fulfill_err (head, op_errno); + } else if (op_ret < head->total_size) { + /* + * We've encountered a short write, for whatever reason. + * Set an EIO error for the next fop. This should be + * valid for writev or flush (close). + * + * TODO: Retry the write so we can potentially capture + * a real error condition (i.e., ENOSPC). + */ + wb_fulfill_err (head, EIO); + } + + wb_head_done (head); + + wb_process_queue (wb_inode); + + STACK_DESTROY (frame->root); + return 0; } -int32_t -wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +#define WB_IOV_LOAD(vec, cnt, req, head) do { \ + memcpy (&vec[cnt], req->stub->args.vector, \ + (req->stub->args.count * sizeof(vec[0]))); \ + cnt += req->stub->args.count; \ + head->total_size += req->write_size; \ + } while (0) + + +int +wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); - - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; + struct iovec vector[MAX_VECTOR_COUNT]; + int count = 0; + wb_request_t *req = NULL; + call_frame_t *frame = NULL; + gf_boolean_t fderr = _gf_false; + xlator_t *this = NULL; + + this = THIS; + + /* make sure head->total_size is updated before we run into any + * errors + */ + + WB_IOV_LOAD (vector, count, head, head); + + list_for_each_entry (req, &head->winds, winds) { + WB_IOV_LOAD (vector, count, req, head); + + iobref_merge (head->stub->args.iobref, + req->stub->args.iobref); + } + + if (wb_fd_err (head->fd, this, NULL)) { + fderr = _gf_true; + goto err; } - file = (wb_file_t *)(long)tmp_file; - local = CALLOC (1, sizeof (*local)); - local->file = file; + frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool); + if (!frame) + goto err; - frame->local = local; + frame->root->lk_owner = head->lk_owner; + frame->local = head; - if (file) { - stub = fop_fstat_stub (frame, wb_fstat_helper, fd); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - - wb_enqueue (file, stub); - - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, - wb_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd); + LOCK (&wb_inode->lock); + { + wb_inode->transit += head->total_size; + } + UNLOCK (&wb_inode->lock); + + STACK_WIND (frame, wb_fulfill_cbk, FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->writev, + head->fd, vector, count, + head->stub->args.offset, + head->stub->args.flags, + head->stub->args.iobref, NULL); + + return 0; +err: + if (!fderr) { + /* frame creation failure */ + fderr = ENOMEM; + wb_fulfill_err (head, fderr); } - return 0; + wb_head_done (head); + + return fderr; } -int32_t -wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +#define NEXT_HEAD(head, req) do { \ + if (head) \ + ret |= wb_fulfill_head (wb_inode, head); \ + head = req; \ + expected_offset = req->stub->args.offset + \ + req->write_size; \ + curr_aggregate = 0; 
\ + vector_count = 0; \ + } while (0) + + +int +wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) { - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - call_frame_t *process_frame = NULL; + wb_request_t *req = NULL; + wb_request_t *head = NULL; + wb_request_t *tmp = NULL; + wb_conf_t *conf = NULL; + off_t expected_offset = 0; + size_t curr_aggregate = 0; + size_t vector_count = 0; + int ret = 0; + + conf = wb_inode->this->private; + + list_for_each_entry_safe (req, tmp, liabilities, winds) { + list_del_init (&req->winds); + + if (!head) { + NEXT_HEAD (head, req); + continue; + } + + if (req->fd != head->fd) { + NEXT_HEAD (head, req); + continue; + } + + if (!is_same_lkowner (&req->lk_owner, &head->lk_owner)) { + NEXT_HEAD (head, req); + continue; + } + + if (expected_offset != req->stub->args.offset) { + NEXT_HEAD (head, req); + continue; + } + + if ((curr_aggregate + req->write_size) > conf->aggregate_size) { + NEXT_HEAD (head, req); + continue; + } + + if (vector_count + req->stub->args.count > + MAX_VECTOR_COUNT) { + NEXT_HEAD (head, req); + continue; + } + + list_add_tail (&req->winds, &head->winds); + curr_aggregate += req->write_size; + vector_count += req->stub->args.count; + } + + if (head) + ret |= wb_fulfill_head (wb_inode, head); + + return ret; +} - local = frame->local; - file = local->file; - request = local->request; - if (request) { - process_frame = copy_frame (frame); - } +void +wb_do_unwinds (wb_inode_t *wb_inode, list_head_t *lies) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + call_frame_t *frame = NULL; + struct iatt buf = {0, }; - STACK_UNWIND (frame, op_ret, op_errno, buf); + list_for_each_entry_safe (req, tmp, lies, unwinds) { + frame = req->stub->frame; - if (request) { - wb_request_unref (request); - wb_process_queue (process_frame, file, 0); - STACK_DESTROY (process_frame->root); - } + STACK_UNWIND_STRICT (writev, frame, req->op_ret, req->op_errno, + &buf, &buf, NULL); /* :O */ + req->stub->frame = NULL; - if (file) { - fd_unref (file->fd); + list_del_init (&req->unwinds); + wb_request_unref (req); } - return 0; + return; } -static int32_t -wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) +void +__wb_pick_unwinds (wb_inode_t *wb_inode, list_head_t *lies) { - STACK_WIND (frame, - wb_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset); + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; - return 0; + list_for_each_entry_safe (req, tmp, &wb_inode->temptation, lie) { + if (!req->ordering.fulfilled && + wb_inode->window_current > wb_inode->window_conf) + continue; + + list_del_init (&req->lie); + list_move_tail (&req->unwinds, lies); + + wb_inode->window_current += req->orig_size; + + if (!req->ordering.fulfilled) { + /* burden increased */ + list_add_tail (&req->lie, &wb_inode->liability); + + req->ordering.lied = 1; + + wb_inode->gen++; + } + } + + return; } -int32_t -wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +int +__wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req) { - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; + char *ptr = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + int ret = -1; + ssize_t required_size = 0; + size_t holder_len = 0; + size_t req_len = 0; + + if (!holder->iobref) { + holder_len = iov_length (holder->stub->args.vector, + holder->stub->args.count); + 
req_len = iov_length (req->stub->args.vector, + req->stub->args.count); + + required_size = max ((THIS->ctx->page_size), + (holder_len + req_len)); + iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool, + required_size); + if (iobuf == NULL) { + goto out; + } - if (loc->inode) - { - /* - FIXME: fd_lookup extends life of fd till the execution of - truncate_cbk - */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)){ - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - } + iobref = iobref_new (); + if (iobref == NULL) { + iobuf_unref (iobuf); + goto out; } - } - - local = CALLOC (1, sizeof (*local)); - local->file = file; - - frame->local = local; - if (file) { - stub = fop_truncate_stub (frame, wb_truncate_helper, loc, - offset); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; + + ret = iobref_add (iobref, iobuf); + if (ret != 0) { + iobuf_unref (iobuf); + iobref_unref (iobref); + gf_log (req->wb_inode->this->name, GF_LOG_WARNING, + "cannot add iobuf (%p) into iobref (%p)", + iobuf, iobref); + goto out; } - wb_enqueue (file, stub); - - wb_process_queue (frame, file, 1); - - } else { - STACK_WIND (frame, - wb_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset); + iov_unload (iobuf->ptr, holder->stub->args.vector, + holder->stub->args.count); + holder->stub->args.vector[0].iov_base = iobuf->ptr; + holder->stub->args.count = 1; + + iobref_unref (holder->stub->args.iobref); + holder->stub->args.iobref = iobref; + + iobuf_unref (iobuf); + + holder->iobref = iobref_ref (iobref); } - return 0; + ptr = holder->stub->args.vector[0].iov_base + holder->write_size; + + iov_unload (ptr, req->stub->args.vector, + req->stub->args.count); + + holder->stub->args.vector[0].iov_len += req->write_size; + holder->write_size += req->write_size; + holder->ordering.size += req->write_size; + + ret = 0; +out: + return ret; } -int32_t -wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +void +__wb_preprocess_winds (wb_inode_t *wb_inode) { - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; + off_t offset_expected = 0; + ssize_t space_left = 0; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *holder = NULL; + wb_conf_t *conf = NULL; + int ret = 0; + ssize_t page_size = 0; + + /* With asynchronous IO from a VM guest (as a file), there + can be two sequential writes happening in two regions + of the file. But individual (broken down) IO requests + can arrive interleaved. 
+ + TODO: cycle for each such sequence sifting + through the interleaved ops + */ + + page_size = wb_inode->this->ctx->page_size; + conf = wb_inode->this->private; + + list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { + if (!req->ordering.tempted) { + if (holder) { + if (wb_requests_conflict (holder, req)) + /* do not hold on write if a + dependent write is in queue */ + holder->ordering.go = 1; + } + /* collapse only non-sync writes */ + continue; + } else if (!holder) { + /* holder is always a non-sync write */ + holder = req; + continue; + } + + offset_expected = holder->stub->args.offset + + holder->write_size; + + if (req->stub->args.offset != offset_expected) { + holder->ordering.go = 1; + holder = req; + continue; + } + + if (!is_same_lkowner (&req->lk_owner, &holder->lk_owner)) { + holder->ordering.go = 1; + holder = req; + continue; + } + + if (req->fd != holder->fd) { + holder->ordering.go = 1; + holder = req; + continue; + } + + space_left = page_size - holder->write_size; + + if (space_left < req->write_size) { + holder->ordering.go = 1; + holder = req; + continue; + } + + ret = __wb_collapse_small_writes (holder, req); + if (ret) + continue; + + /* collapsed request is as good as wound + (from its p.o.v) + */ + list_del_init (&req->todo); + __wb_fulfill_request (req); + + /* Only the last @holder in queue which - local = frame->local; - file = local->file; - request = local->request; + - does not have any non-buffered-writes following it + - has not yet filled its capacity - if (request) { - wb_request_unref (request); - wb_process_queue (frame, file, 0); + does not get its 'go' set, in anticipation of the arrival + of consecutive smaller writes. + */ } - STACK_UNWIND (frame, op_ret, op_errno, buf); + /* but if trickling writes are enabled, then do not hold back + writes if there are no outstanding requests + */ - return 0; + if (conf->trickling_writes && !wb_inode->transit && holder) + holder->ordering.go = 1; + + return; } -static int32_t -wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) +void +__wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, + list_head_t *liabilities) { - STACK_WIND (frame, - wb_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset); - return 0; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + + list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { + if (wb_liability_has_conflict (wb_inode, req)) + continue; + + if (req->ordering.tempted && !req->ordering.go) + /* wait some more */ + continue; + + if (req->stub->fop == GF_FOP_WRITE) { + if (wb_wip_has_conflict (wb_inode, req)) + continue; + + list_add_tail (&req->wip, &wb_inode->wip); + + if (!req->ordering.tempted) + /* unrefed in wb_writev_cbk */ + req->stub->frame->local = + __wb_request_ref (req); + } + + list_del_init (&req->todo); + + if (req->ordering.tempted) + list_add_tail (&req->winds, liabilities); + else + list_add_tail (&req->winds, tasks); + } } - -int32_t -wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) + +void +wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); - - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; - } + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + 
+ list_for_each_entry_safe (req, tmp, tasks, winds) { + list_del_init (&req->winds); - file = (wb_file_t *)(long)tmp_file; + call_resume (req->stub); - local = CALLOC (1, sizeof (*local)); - local->file = file; + wb_request_unref (req); + } +} - frame->local = local; - if (file) { - stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, - offset); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; +void +wb_process_queue (wb_inode_t *wb_inode) +{ + list_head_t tasks = {0, }; + list_head_t lies = {0, }; + list_head_t liabilities = {0, }; + int retry = 0; + + INIT_LIST_HEAD (&tasks); + INIT_LIST_HEAD (&lies); + INIT_LIST_HEAD (&liabilities); + + do { + LOCK (&wb_inode->lock); + { + __wb_preprocess_winds (wb_inode); + + __wb_pick_winds (wb_inode, &tasks, &liabilities); + + __wb_pick_unwinds (wb_inode, &lies); + } + UNLOCK (&wb_inode->lock); - wb_enqueue (file, stub); + wb_do_unwinds (wb_inode, &lies); - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, - wb_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset); - } + wb_do_winds (wb_inode, &tasks); - return 0; + /* fd might've been marked bad due to previous errors. + * Since, caller of wb_process_queue might be the last fop on + * inode, make sure we keep processing request queue, till there + * are no requests left. + */ + retry = wb_fulfill (wb_inode, &liabilities); + } while (retry); + + return; } -int32_t -wb_utimens_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +int +wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - wb_local_t *local = NULL; - wb_request_t *request = NULL; - call_frame_t *process_frame = NULL; - wb_file_t *file = NULL; - - local = frame->local; - file = local->file; - request = local->request; - - if (request) { - process_frame = copy_frame (frame); - } + wb_request_t *req = NULL; + wb_inode_t *wb_inode; - STACK_UNWIND (frame, op_ret, op_errno, buf); + req = frame->local; + frame->local = NULL; + wb_inode = req->wb_inode; - if (request) { - wb_request_unref (request); - wb_process_queue (process_frame, file, 0); - STACK_DESTROY (process_frame->root); - } + wb_request_unref (req); - if (file) { - fd_unref (file->fd); - } + /* requests could be pending while this was in progress */ + wb_process_queue(wb_inode); - return 0; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } -static int32_t -wb_utimens_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct timespec tv[2]) +int +wb_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - STACK_WIND (frame, - wb_utimens_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->utimens, - loc, - tv); - - return 0; + STACK_WIND (frame, wb_writev_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; } -int32_t -wb_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct timespec tv[2]) +int +wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t 
*stub = NULL; - - if (loc->inode) { - /* - FIXME: fd_lookup extends life of fd till the execution - of wb_utimens_cbk - */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)) { - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - } - } + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + gf_boolean_t wb_disabled = 0; + call_stub_t *stub = NULL; + int ret = -1; + int32_t op_errno = EINVAL; + int o_direct = O_DIRECT; + conf = this->private; + + if (wb_fd_err (fd, this, &op_errno)) { + goto unwind; } - local = CALLOC (1, sizeof (*local)); - local->file = file; + wb_inode = wb_inode_create (this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; + } - frame->local = local; + if (!conf->strict_O_DIRECT) + o_direct = 0; - if (file) { - stub = fop_utimens_stub (frame, wb_utimens_helper, loc, tv); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } + if (fd->flags & (O_SYNC|O_DSYNC|o_direct)) + wb_disabled = 1; - wb_enqueue (file, stub); + if (flags & (O_SYNC|O_DSYNC|o_direct)) + wb_disabled = 1; - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, - wb_utimens_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->utimens, - loc, - tv); + if (wb_disabled) + stub = fop_writev_stub (frame, wb_writev_helper, fd, vector, + count, offset, flags, iobref, xdata); + else + stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, + flags, iobref, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; } - return 0; -} - -int32_t -wb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) -{ - int32_t flags = 0; - wb_file_t *file = NULL; - wb_conf_t *conf = this->private; + if (wb_disabled) + ret = wb_enqueue (wb_inode, stub); + else + ret = wb_enqueue_tempted (wb_inode, stub); - if (op_ret != -1) - { - file = wb_file_create (this, fd); + if (!ret) { + op_errno = ENOMEM; + goto unwind; + } - /* - If mandatory locking has been enabled on this file, - we disable caching on it - */ + wb_process_queue (wb_inode); - if ((fd->inode->st_mode & S_ISGID) - && !(fd->inode->st_mode & S_IXGRP)) - file->disabled = 1; + return 0; - /* If O_DIRECT then, we disable chaching */ - if (frame->local) - { - flags = *((int32_t *)frame->local); - if (((flags & O_DIRECT) == O_DIRECT) - || ((flags & O_RDONLY) == O_RDONLY) - || (((flags & O_SYNC) == O_SYNC) - && conf->enable_O_SYNC == _gf_true)) { - file->disabled = 1; - } - } +unwind: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); - LOCK_INIT (&file->lock); - } + if (stub) + call_stub_destroy (stub); - STACK_UNWIND (frame, op_ret, op_errno, fd); return 0; } -int32_t -wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd) +int +wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - frame->local = CALLOC (1, sizeof(int32_t)); - *((int32_t *)frame->local) = flags; - - STACK_WIND (frame, - wb_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, flags, fd); + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); return 0; } -int32_t -wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *buf) +int +wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t 
offset, uint32_t flags, dict_t *xdata) { - wb_file_t *file = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - if (op_ret != -1) - { - file = wb_file_create (this, fd); - /* - * If mandatory locking has been enabled on this file, - * we disable caching on it - */ - if ((fd->inode->st_mode & S_ISGID) - && !(fd->inode->st_mode & S_IXGRP)) - { - file->disabled = 1; - } + wb_inode = wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) + goto noqueue; - LOCK_INIT (&file->lock); - } + stub = fop_readv_stub (frame, wb_readv_helper, fd, size, + offset, flags, xdata); + if (!stub) + goto unwind; + + if (!wb_enqueue (wb_inode, stub)) + goto unwind; + + wb_process_queue (wb_inode); - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); return 0; -} +unwind: + STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, + NULL); + return 0; -int32_t -wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd) -{ - STACK_WIND (frame, - wb_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd); +noqueue: + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); return 0; } -size_t -__wb_mark_wind_all (list_head_t *list, list_head_t *winds) +int +wb_flush_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - wb_request_t *request = NULL; - size_t size = 0; - struct iovec *vector = NULL; - int32_t count = 0; - char first_request = 1; - off_t offset_expected = 0; - size_t length = 0; - - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - break; - } - - vector = request->stub->args.writev.vector; - count = request->stub->args.writev.count; - if (!request->flags.write_request.stack_wound) { - if (first_request) { - first_request = 0; - offset_expected = request->stub->args.writev.off; - } - - if (request->stub->args.writev.off != offset_expected) { - break; - } - - length = iov_length (vector, count); - size += length; - offset_expected += length; - - request->flags.write_request.stack_wound = 1; - list_add_tail (&request->winds, winds); - } - } - - return size; + STACK_DESTROY (frame->root); + return 0; } -size_t -__wb_get_aggregate_size (list_head_t *list, char *other_fop_in_queue, - char *non_contiguous_writes) +int +wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_request_t *request = NULL; - size_t size = 0, length = 0; - struct iovec *vector = NULL; - int32_t count = 0; - char first_request = 1; - off_t offset_expected = 0; + wb_conf_t *conf = NULL; + wb_inode_t *wb_inode = NULL; + call_frame_t *bg_frame = NULL; + int32_t op_errno = 0; + int op_ret = 0; - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - if (request->stub && other_fop_in_queue) { - *other_fop_in_queue = 1; - } - break; - } + conf = this->private; - vector = request->stub->args.writev.vector; - count = request->stub->args.writev.count; - if (!request->flags.write_request.stack_wound) { - if (first_request) { - first_request = 0; - offset_expected = request->stub->args.writev.off; - } - - if (offset_expected != request->stub->args.writev.off) { - if (non_contiguous_writes) { - *non_contiguous_writes = 1; - } - break; - } - - length = iov_length (vector, count); - size += length; - offset_expected += length; - } - } + wb_inode = 
wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + if (wb_fd_err (fd, this, &op_errno)) { + op_ret = -1; + goto unwind; + } + + if (conf->flush_behind) + goto flushbehind; + + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + +flushbehind: + bg_frame = copy_frame (frame); + if (!bg_frame) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + STACK_WIND (bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + /* fall through */ +unwind: + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); - return size; + return 0; } -uint32_t -__wb_get_incomplete_writes (list_head_t *list) +int +wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_request_t *request = NULL; - uint32_t count = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - break; - } + wb_inode = wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) + goto noqueue; - if (request->flags.write_request.stack_wound - && !request->flags.write_request.got_reply) { - count++; - } - } + stub = fop_flush_stub (frame, wb_flush_helper, fd, xdata); + if (!stub) + goto unwind; - return count; -} + if (!wb_enqueue (wb_inode, stub)) + goto unwind; + wb_process_queue (wb_inode); -size_t -__wb_mark_wind_atmost_aggregate_size (list_head_t *list, list_head_t *winds, - size_t aggregate_conf) -{ - wb_request_t *request = NULL; - struct iovec *vector = NULL; - int32_t count = 0; - size_t aggregate_current = 0, size = 0, length = 0; + return 0; - list_for_each_entry (request, list, list) - { - vector = request->stub->args.writev.vector; - count = request->stub->args.writev.count; - if (!request->flags.write_request.stack_wound) { - length = iov_length (vector, count); - size += length; - aggregate_current += length; - - if (aggregate_current > aggregate_conf) { - break; - } - - request->flags.write_request.stack_wound = 1; - list_add_tail (&request->winds, winds); - } - } +unwind: + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, NULL); + + return 0; - return size; +noqueue: + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; } -size_t -__wb_mark_wind_aggregate_size_aware (list_head_t *list, list_head_t *winds, - size_t aggregate_conf) + + +int +wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync, dict_t *xdata) { - size_t size = 0; - size_t aggregate_current = 0; - - aggregate_current = __wb_get_aggregate_size (list, NULL, NULL); - while (aggregate_current >= aggregate_conf) { - size += __wb_mark_wind_atmost_aggregate_size (list, winds, - aggregate_conf); - - aggregate_current = __wb_get_aggregate_size (list, NULL, NULL); - } - - return size; + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; } -size_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf, - char wind_all) +int +wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - size_t aggregate_current = 0; - uint32_t incomplete_writes = 0; - char other_fop_in_queue = 0; - char non_contiguous_writes = 0; - - incomplete_writes = __wb_get_incomplete_writes (list); - - aggregate_current = __wb_get_aggregate_size (list, 
&other_fop_in_queue, - &non_contiguous_writes); - - if ((incomplete_writes == 0) || (wind_all) || (non_contiguous_writes) - || (other_fop_in_queue)) { - __wb_mark_wind_all (list, winds); - } else if (aggregate_current >= aggregate_conf) { - __wb_mark_wind_aggregate_size_aware (list, winds, - aggregate_conf); - } + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = EINVAL; - return aggregate_current; -} + if (wb_fd_err (fd, this, &op_errno)) + goto unwind; + wb_inode = wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) + goto noqueue; -size_t -__wb_get_window_size (list_head_t *list) -{ - wb_request_t *request = NULL; - size_t size = 0; - struct iovec *vector = NULL; - int32_t count = 0; + stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync, xdata); + if (!stub) + goto unwind; - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - continue; - } + if (!wb_enqueue (wb_inode, stub)) + goto unwind; - vector = request->stub->args.writev.vector; - count = request->stub->args.writev.count; + wb_process_queue (wb_inode); + + return 0; + +unwind: + STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; + +noqueue: + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} - if (request->flags.write_request.write_behind - && !request->flags.write_request.got_reply) - { - size += iov_length (vector, count); - } - } - return size; +int +wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; } -size_t -__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size) +int +wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - size_t written_behind = 0; - wb_request_t *request = NULL; - struct iovec *vector = NULL; - int32_t count = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - continue; - } - vector = request->stub->args.writev.vector; - count = request->stub->args.writev.count; - - if (written_behind <= size) { - if (!request->flags.write_request.write_behind) { - written_behind += iov_length (vector, count); - request->flags.write_request.write_behind = 1; - list_add_tail (&request->unwinds, unwinds); - } - } else { - break; - } - } + wb_inode = wb_inode_ctx_get (this, loc->inode); + if (!wb_inode) + goto noqueue; - return written_behind; -} + stub = fop_stat_stub (frame, wb_stat_helper, loc, xdata); + if (!stub) + goto unwind; + if (!wb_enqueue (wb_inode, stub)) + goto unwind; -int32_t -__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds, size_t window_conf) -{ - size_t window_current = 0; + wb_process_queue (wb_inode); - window_current = __wb_get_window_size (list); - if (window_current <= window_conf) - { - window_current += __wb_mark_unwind_till (list, unwinds, - window_conf - window_current); - } + return 0; + +unwind: + STACK_UNWIND_STRICT (stat, frame, -1, ENOMEM, NULL, NULL); + + if (stub) + call_stub_destroy (stub); + return 0; - return window_current; +noqueue: + STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; } -uint32_t -__wb_get_other_requests (list_head_t *list, list_head_t *other_requests) +int 
+wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_request_t *request = NULL; - uint32_t count = 0; - list_for_each_entry (request, list, list) { - if ((request->stub == NULL) - || (request->stub->fop == GF_FOP_WRITE)) { - break; - } - - if (!request->flags.other_requests.marked_for_resume) { - request->flags.other_requests.marked_for_resume = 1; - list_add_tail (&request->other_requests, - other_requests); - count++; - - /* lets handle one at a time */ - break; - } - } - - return count; + STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; } -int32_t -wb_stack_unwind (list_head_t *unwinds) +int +wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - struct stat buf = {0,}; - wb_request_t *request = NULL, *dummy = NULL; - call_frame_t *frame = NULL; - wb_local_t *local = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - list_for_each_entry_safe (request, dummy, unwinds, unwinds) - { - frame = request->stub->frame; - local = frame->local; - STACK_UNWIND (frame, local->op_ret, local->op_errno, &buf); + wb_inode = wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) + goto noqueue; - wb_request_unref (request); - } + stub = fop_fstat_stub (frame, wb_fstat_helper, fd, xdata); + if (!stub) + goto unwind; - return 0; -} + if (!wb_enqueue (wb_inode, stub)) + goto unwind; + wb_process_queue (wb_inode); -int32_t -wb_resume_other_requests (call_frame_t *frame, wb_file_t *file, - list_head_t *other_requests) -{ - int32_t ret = 0; - wb_request_t *request = NULL, *dummy = NULL; - int32_t fops_removed = 0; - char wind = 0; - call_stub_t *stub = NULL; + return 0; - if (list_empty (other_requests)) { - goto out; - } +unwind: + STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, NULL, NULL); - list_for_each_entry_safe (request, dummy, other_requests, - other_requests) { - wind = request->stub->wind; - stub = request->stub; - - LOCK (&file->lock); - { - request->stub = NULL; - } - UNLOCK (&file->lock); - - if (!wind) { - wb_request_unref (request); - fops_removed++; - } - - call_resume (stub); - } + if (stub) + call_stub_destroy (stub); + return 0; - if (fops_removed > 0) { - wb_process_queue (frame, file, 0); - } - -out: - return ret; +noqueue: + STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; } -int32_t -wb_do_ops (call_frame_t *frame, wb_file_t *file, list_head_t *winds, - list_head_t *unwinds, list_head_t *other_requests) +int +wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) { - wb_stack_unwind (unwinds); - wb_sync (frame, file, winds); - wb_resume_other_requests (frame, file, other_requests); - + STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); return 0; } -int32_t -wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all) +int +wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - list_head_t winds, unwinds, other_requests; - size_t size = 0; - wb_conf_t *conf = file->this->private; - uint32_t count = 0; - - INIT_LIST_HEAD (&winds); - INIT_LIST_HEAD (&unwinds); - INIT_LIST_HEAD (&other_requests); - - if (!file) { - return -1; - } + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - size = conf->aggregate_size; - LOCK (&file->lock); - { - count = __wb_get_other_requests (&file->request, - &other_requests); + wb_inode = 
wb_inode_create (this, loc->inode); + if (!wb_inode) + goto unwind; - if (count == 0) { - __wb_mark_winds (&file->request, &winds, size, - flush_all); - } + stub = fop_truncate_stub (frame, wb_truncate_helper, loc, + offset, xdata); + if (!stub) + goto unwind; - __wb_mark_unwinds (&file->request, &unwinds, conf->window_size); - } - UNLOCK (&file->lock); + if (!wb_enqueue (wb_inode, stub)) + goto unwind; + + wb_process_queue (wb_inode); + + return 0; + +unwind: + STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy (stub); - wb_do_ops (frame, file, &winds, &unwinds, &other_requests); return 0; } -int32_t -wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *stbuf) +int +wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) { - STACK_UNWIND (frame, op_ret, op_errno, stbuf); + STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); return 0; } -int32_t -wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, struct iobref *iobref) +int +wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - wb_file_t *file = NULL; - char wb_disabled = 0; - call_frame_t *process_frame = NULL; - size_t size = 0; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_local_t *local = NULL; - - if (vector != NULL) - size = iov_length (vector, count); - - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); - - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; - } + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = 0; - file = (wb_file_t *)(long)tmp_file; - if (!file) { - gf_log (this->name, GF_LOG_DEBUG, - "wb_file not found for fd %p", fd); - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; + wb_inode = wb_inode_create (this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; } - LOCK (&file->lock); - { - if (file->disabled || file->disable_till) { - if (size > file->disable_till) { - file->disable_till = 0; - } else { - file->disable_till -= size; - } - wb_disabled = 1; - } + if (wb_fd_err (fd, this, &op_errno)) + goto unwind; + + stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, + offset, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; } - UNLOCK (&file->lock); - - if (wb_disabled) { - STACK_WIND (frame, wb_writev_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->writev, - fd, vector, count, offset, iobref); - return 0; + + if (!wb_enqueue (wb_inode, stub)) { + op_errno = ENOMEM; + goto unwind; } - process_frame = copy_frame (frame); + wb_process_queue (wb_inode); - local = CALLOC (1, sizeof (*local)); - frame->local = local; - local->file = file; + return 0; - stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, - iobref); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } +unwind: + STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - wb_enqueue (file, stub); - wb_process_queue (process_frame, file, 0); + if (stub) + call_stub_destroy (stub); + return 0; +} - STACK_DESTROY (process_frame->root); +int +wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND 
(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); return 0; } -int32_t -wb_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iovec *vector, int32_t count, - struct stat *stbuf, struct iobref *iobref) +int +wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - local = frame->local; - file = local->file; - request = local->request; + wb_inode = wb_inode_ctx_get (this, loc->inode); + if (!wb_inode) + goto noqueue; - if (request) { - wb_request_unref (request); - wb_process_queue (frame, file, 0); - } + stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, + valid, xdata); + if (!stub) + goto unwind; - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); + if (!wb_enqueue (wb_inode, stub)) + goto unwind; + + wb_process_queue (wb_inode); return 0; +unwind: + STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy (stub); + return 0; + +noqueue: + STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; } -static int32_t -wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +int +wb_fsetattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - STACK_WIND (frame, - wb_readv_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, size, offset); - + STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); return 0; } -int32_t -wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +int +wb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); - - STACK_UNWIND (frame, -1, EBADFD, NULL); - return 0; - } + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - file = (wb_file_t *)(long)tmp_file; + wb_inode = wb_inode_ctx_get (this, fd->inode); + if (!wb_inode) + goto noqueue; - local = CALLOC (1, sizeof (*local)); - local->file = file; + stub = fop_fsetattr_stub (frame, wb_fsetattr_helper, fd, stbuf, + valid, xdata); + if (!stub) + goto unwind; - frame->local = local; - if (file) { - stub = fop_readv_stub (frame, wb_readv_helper, fd, size, - offset); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } + if (!wb_enqueue (wb_inode, stub)) + goto unwind; - wb_enqueue (file, stub); + wb_process_queue (wb_inode); - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, - wb_readv_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, size, offset); - } + return 0; +unwind: + STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + + if (stub) + call_stub_destroy (stub); + return 0; +noqueue: + STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); return 0; } -int32_t 
-wb_ffr_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +int +wb_forget (xlator_t *this, inode_t *inode) { - STACK_DESTROY (frame->root); + uint64_t tmp = 0; + wb_inode_t *wb_inode = NULL; + + inode_ctx_del (inode, this, &tmp); + + wb_inode = (wb_inode_t *)(long)tmp; + + if (!wb_inode) + return 0; + + GF_ASSERT (list_empty (&wb_inode->todo)); + GF_ASSERT (list_empty (&wb_inode->liability)); + GF_ASSERT (list_empty (&wb_inode->temptation)); + + GF_FREE (wb_inode); + return 0; } -int32_t -wb_ffr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) +int +wb_release (xlator_t *this, fd_t *fd) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_conf_t *conf = NULL; - char unwind = 0; + uint64_t tmp = 0; + + fd_ctx_del (fd, this, &tmp); + + return 0; +} + + +int +wb_priv_dump (xlator_t *this) +{ + wb_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("write-behind", this, out); conf = this->private; - local = frame->local; - file = local->file; - - if (conf->flush_behind - && (!file->disabled) && (file->disable_till == 0)) { - unwind = 1; - } else { - local->reply_count++; - /* - without flush-behind, unwind should wait for replies of - writes queued before and the flush - */ - if (local->reply_count == 2) { - unwind = 1; - } - } + GF_VALIDATE_OR_GOTO (this->name, conf, out); - if (unwind) { - if (file->op_ret == -1) { - op_ret = file->op_ret; - op_errno = file->op_errno; + gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", + "priv"); - file->op_ret = 0; - } + gf_proc_dump_add_section (key_prefix); - wb_process_queue (frame, file, 0); - - STACK_UNWIND (frame, op_ret, op_errno); - } + gf_proc_dump_write ("aggregate_size", "%d", conf->aggregate_size); + gf_proc_dump_write ("window_size", "%d", conf->window_size); + gf_proc_dump_write ("flush_behind", "%d", conf->flush_behind); + gf_proc_dump_write ("trickling_writes", "%d", conf->trickling_writes); - return 0; + ret = 0; +out: + return ret; } -int32_t -wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +void +__wb_dump_requests (struct list_head *head, char *prefix) { - wb_conf_t *conf = NULL; - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - call_frame_t *process_frame = NULL; - wb_local_t *tmp_local = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }, flag = 0; + wb_request_t *req = NULL; - conf = this->private; + list_for_each_entry (req, head, all) { + gf_proc_dump_build_key (key_prefix, key, + (char *)gf_fop_list[req->fop]); - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); + gf_proc_dump_add_section(key_prefix); - STACK_UNWIND (frame, -1, EBADFD); - return 0; - } + gf_proc_dump_write ("request-ptr", "%p", req); - file = (wb_file_t *)(long)tmp_file; + gf_proc_dump_write ("refcount", "%d", req->refcount); - local = CALLOC (1, sizeof (*local)); - local->file = file; + if (list_empty (&req->todo)) + gf_proc_dump_write ("wound", "yes"); + else + gf_proc_dump_write ("wound", "no"); - frame->local = local; - stub = fop_flush_cbk_stub (frame, wb_ffr_cbk, 0, 0); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM); - return 0; - } + if (req->fop == GF_FOP_WRITE) { + gf_proc_dump_write ("size", "%"GF_PRI_SIZET, + req->write_size); - process_frame = 
copy_frame (frame); - if (conf->flush_behind - && (!file->disabled) && (file->disable_till == 0)) { - tmp_local = CALLOC (1, sizeof (*local)); - tmp_local->file = file; + gf_proc_dump_write ("offset", "%"PRId64, + req->stub->args.offset); - process_frame->local = tmp_local; - } + flag = req->ordering.lied; + gf_proc_dump_write ("lied", "%d", flag); - fd_ref (fd); - - wb_enqueue (file, stub); - - wb_process_queue (process_frame, file, 1); - - if (conf->flush_behind - && (!file->disabled) && (file->disable_till == 0)) { - STACK_WIND (process_frame, - wb_ffr_bg_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); - } else { - STACK_WIND (frame, - wb_ffr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); - STACK_DESTROY (process_frame->root); - } + flag = req->ordering.append; + gf_proc_dump_write ("append", "%d", flag); - fd_unref (fd); + flag = req->ordering.fulfilled; + gf_proc_dump_write ("fulfilled", "%d", flag); - return 0; + flag = req->ordering.go; + gf_proc_dump_write ("go", "%d", flag); + } + } } -static int32_t -wb_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) +int +wb_inode_dump (xlator_t *this, inode_t *inode) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL; + wb_inode_t *wb_inode = NULL; + int32_t ret = -1; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + char uuid_str[64] = {0,}; + + if ((inode == NULL) || (this == NULL)) { + ret = 0; + goto out; + } - local = frame->local; - file = local->file; - request = local->request; + wb_inode = wb_inode_ctx_get (this, inode); + if (wb_inode == NULL) { + ret = 0; + goto out; + } - if (file->op_ret == -1) { - op_ret = file->op_ret; - op_errno = file->op_errno; + gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", + "wb_inode"); - file->op_ret = 0; - } + gf_proc_dump_add_section (key_prefix); - if (request) { - wb_request_unref (request); - wb_process_queue (frame, file, 0); + __inode_path (inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write ("path", "%s", path); + GF_FREE (path); } - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} + gf_proc_dump_write ("inode", "%p", inode); + gf_proc_dump_write ("window_conf", "%"GF_PRI_SIZET, + wb_inode->window_conf); -static int32_t -wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - STACK_WIND (frame, - wb_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, datasync); - return 0; -} + gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET, + wb_inode->window_current); -int32_t -wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) -{ - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - - if (fd_ctx_get (fd, this, &tmp_file)) { - gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is" - " not stored in context of fd(%p), returning EBADFD", - fd); - - STACK_UNWIND (frame, -1, EBADFD); - return 0; + ret = TRY_LOCK (&wb_inode->lock); + if (!ret) + { + if (!list_empty (&wb_inode->all)) { + __wb_dump_requests (&wb_inode->all, key_prefix); + } + UNLOCK (&wb_inode->lock); } - file = (wb_file_t *)(long)tmp_file; + if (ret && wb_inode) + gf_proc_dump_write ("Unable to dump the inode information", + "(Lock acquisition failed) %p (gfid: %s)", + wb_inode, + uuid_utoa_r (inode->gfid, uuid_str)); + ret = 0; +out: + return ret; +} + - local = CALLOC (1, sizeof (*local)); - local->file = file; +int 
+mem_acct_init (xlator_t *this) +{ + int ret = -1; - frame->local = local; + if (!this) { + goto out; + } - if (file) { - stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync); - if (stub == NULL) { - STACK_UNWIND (frame, -1, ENOMEM); - return 0; - } - - wb_enqueue (file, stub); - - wb_process_queue (frame, file, 1); - } else { - STACK_WIND (frame, - wb_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, datasync); + ret = xlator_mem_acct_init (this, gf_wb_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); } - return 0; +out: + return ret; } -int32_t -wb_release (xlator_t *this, fd_t *fd) +int +reconfigure (xlator_t *this, dict_t *options) { - uint64_t file_ptr = 0; - wb_file_t *file = NULL; + wb_conf_t *conf = NULL; + int ret = -1; - fd_ctx_get (fd, this, &file_ptr); - file = (wb_file_t *) (long) file_ptr; + conf = this->private; - LOCK (&file->lock); - { - assert (list_empty (&file->request)); - } - UNLOCK (&file->lock); + GF_OPTION_RECONF ("cache-size", conf->window_size, options, size, out); - wb_file_destroy (file); + GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, + out); - return 0; + GF_OPTION_RECONF ("trickling-writes", conf->trickling_writes, options, + bool, out); + + GF_OPTION_RECONF ("strict-O_DIRECT", conf->strict_O_DIRECT, options, + bool, out); + + GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering, + options, bool, out); + ret = 0; +out: + return ret; } -int32_t +int32_t init (xlator_t *this) { - dict_t *options = NULL; - wb_conf_t *conf = NULL; - char *window_size_string = NULL; - char *flush_behind_string = NULL; - char *disable_till_string = NULL; - char *enable_O_SYNC_string = NULL; - int32_t ret = -1; + wb_conf_t *conf = NULL; + int32_t ret = -1; if ((this->children == NULL) || this->children->next) { gf_log (this->name, GF_LOG_ERROR, "FATAL: write-behind (%s) not configured with exactly " - "one child", - this->name); - return -1; + "one child", this->name); + goto out; } if (this->parents == NULL) { gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); + "dangling volume. 
check volfile");
 	}
-
-        options = this->options;
-
-        conf = CALLOC (1, sizeof (*conf));
-
-        conf->enable_O_SYNC = _gf_false;
-        ret = dict_get_str (options, "enable-O_SYNC",
-                            &enable_O_SYNC_string);
-        if (ret == 0) {
-                ret = gf_string2boolean (enable_O_SYNC_string,
-                                         &conf->enable_O_SYNC);
-                if (ret == -1) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "'enable-O_SYNC' takes only boolean arguments");
-                        return -1;
-                }
+
+        conf = GF_CALLOC (1, sizeof (*conf), gf_wb_mt_wb_conf_t);
+        if (conf == NULL) {
+                goto out;
         }
 
         /* configure 'options aggregate-size <size>' */
         conf->aggregate_size = WB_AGGREGATE_SIZE;
-        conf->disable_till = 1;
-        ret = dict_get_str (options, "disable-for-first-nbytes",
-                            &disable_till_string);
-        if (ret == 0) {
-                ret = gf_string2bytesize (disable_till_string,
-                                          &conf->disable_till);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "invalid number format \"%s\" of \"option "
-                                "disable-for-first-nbytes\"",
-                                disable_till_string);
-                        return -1;
-                }
-        }
-        gf_log (this->name, GF_LOG_DEBUG,
-                "disabling write-behind for first %"PRIu64" bytes",
-                conf->disable_till);
-
         /* configure 'option window-size <size>' */
-        conf->window_size = 0;
-        ret = dict_get_str (options, "cache-size",
-                            &window_size_string);
-        if (ret == 0) {
-                ret = gf_string2bytesize (window_size_string,
-                                          &conf->window_size);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "invalid number format \"%s\" of \"option "
-                                "window-size\"",
-                                window_size_string);
-                        FREE (conf);
-                        return -1;
-                }
-        }
+        GF_OPTION_INIT ("cache-size", conf->window_size, size, out);
 
         if (!conf->window_size && conf->aggregate_size) {
                 gf_log (this->name, GF_LOG_WARNING,
@@ -1886,49 +2018,54 @@ init (xlator_t *this)
         if (conf->window_size < conf->aggregate_size) {
                 gf_log (this->name, GF_LOG_ERROR,
                         "aggregate-size(%"PRIu64") cannot be more than "
-                        "window-size"
-                        "(%"PRIu64")", conf->window_size, conf->aggregate_size);
-                FREE (conf);
-                return -1;
+                        "window-size(%"PRIu64")", conf->aggregate_size,
+                        conf->window_size);
+                goto out;
         }
 
         /* configure 'option flush-behind <on/off>' */
-        conf->flush_behind = 0;
-        ret = dict_get_str (options, "flush-behind",
-                            &flush_behind_string);
-        if (ret == 0) {
-                ret = gf_string2boolean (flush_behind_string,
-                                         &conf->flush_behind);
-                if (ret == -1) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "'flush-behind' takes only boolean arguments");
-                        return -1;
-                }
+        GF_OPTION_INIT ("flush-behind", conf->flush_behind, bool, out);
+
+        GF_OPTION_INIT ("trickling-writes", conf->trickling_writes, bool, out);
+
+        GF_OPTION_INIT ("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out);
+
+        GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering,
+                        bool, out);
 
-                if (conf->flush_behind) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "enabling flush-behind");
-                }
         }
         this->private = conf;
-        return 0;
+        ret = 0;
+
+out:
+        if (ret) {
+                GF_FREE (conf);
+        }
+        return ret;
 }
 
 
 void
 fini (xlator_t *this)
 {
-        wb_conf_t *conf = this->private;
+        wb_conf_t *conf = NULL;
+
+        GF_VALIDATE_OR_GOTO ("write-behind", this, out);
 
-        FREE (conf);
+        conf = this->private;
+        if (!conf) {
+                goto out;
+        }
+
+        this->private = NULL;
+        GF_FREE (conf);
+
+out:
         return;
 }
 
 
 struct xlator_fops fops = {
         .writev      = wb_writev,
-        .open        = wb_open,
-        .create      = wb_create,
         .readv       = wb_readv,
         .flush       = wb_flush,
         .fsync       = wb_fsync,
@@ -1936,32 +2073,56 @@ struct xlator_fops fops = {
         .fstat       = wb_fstat,
         .truncate    = wb_truncate,
         .ftruncate   = wb_ftruncate,
-        .utimens     = wb_utimens,
+        .setattr     = wb_setattr,
+        .fsetattr    = wb_fsetattr,
};
 
-struct xlator_mops mops = {
-};
 
struct 
xlator_cbks cbks = { + .forget = wb_forget, .release = wb_release }; + +struct xlator_dumpops dumpops = { + .priv = wb_priv_dump, + .inodectx = wb_inode_dump, +}; + + struct volume_options options[] = { - { .key = {"flush-behind"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"cache-size", "window-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 512 * GF_UNIT_KB, - .max = 1 * GF_UNIT_GB + { .key = {"flush-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "If this option is set ON, instructs write-behind " + "translator to perform flush in background, by " + "returning success (or any errors, if any of " + "previous writes were failed) to application even " + "before flush FOP is sent to backend filesystem. " }, - { .key = {"disable-for-first-nbytes"}, + { .key = {"cache-size", "window-size"}, .type = GF_OPTION_TYPE_SIZET, - .min = 1, - .max = 1 * GF_UNIT_MB, + .min = 512 * GF_UNIT_KB, + .max = 1 * GF_UNIT_GB, + .default_value = "1MB", + .description = "Size of the write-behind buffer for a single file " + "(inode)." + }, + { .key = {"trickling-writes"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + }, + { .key = {"strict-O_DIRECT"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option when set to off, ignores the " + "O_DIRECT flag." }, - { .key = {"enable-O_SYNC"}, + { .key = {"strict-write-ordering"}, .type = GF_OPTION_TYPE_BOOL, - }, + .default_value = "off", + .description = "Do not let later writes overtake earlier writes even " + "if they do not overlap", + }, { .key = {NULL} }, }; |
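
Editor's note: the new wb_fulfill()/NEXT_HEAD() path in this patch groups queued writes into batches before winding a single writev per batch, starting a new batch whenever the next request is on a different fd, has a different lock owner, is not contiguous with the previous request's offset, or would exceed the aggregate-size limit. The following is a minimal standalone sketch of just that grouping rule, not code from the patch: pending_write and AGGREGATE_LIMIT are made-up stand-ins for wb_request_t and conf->aggregate_size, and the MAX_VECTOR_COUNT check is omitted for brevity.

/* Standalone illustration of the batching rule used by wb_fulfill():
 * a new batch starts on a change of fd, change of lock owner, a gap in
 * offsets, or when the batch would outgrow the aggregate limit. */
#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

#define AGGREGATE_LIMIT (128 * 1024)  /* stand-in for conf->aggregate_size */

struct pending_write {
        int    fd;      /* stand-in for req->fd           */
        int    owner;   /* stand-in for req->lk_owner     */
        off_t  offset;  /* stand-in for stub->args.offset */
        size_t size;    /* stand-in for req->write_size   */
};

int
main (void)
{
        struct pending_write q[] = {
                { 3, 1, 0,     4096 },
                { 3, 1, 4096,  4096 },  /* contiguous: same batch     */
                { 3, 1, 65536, 4096 },  /* hole in offsets: new batch */
                { 4, 1, 69632, 4096 },  /* different fd: new batch    */
        };
        size_t nreq = sizeof (q) / sizeof (q[0]);
        size_t i = 0, batch = 0, batch_bytes = 0;
        off_t  expected = 0;

        for (i = 0; i < nreq; i++) {
                /* same tests wb_fulfill() applies before reusing a head */
                int new_batch = (i == 0)
                        || (q[i].fd != q[i - 1].fd)
                        || (q[i].owner != q[i - 1].owner)
                        || (q[i].offset != expected)
                        || (batch_bytes + q[i].size > AGGREGATE_LIMIT);

                if (new_batch) {
                        batch++;
                        batch_bytes = 0;
                }

                batch_bytes += q[i].size;
                expected = q[i].offset + (off_t) q[i].size;

                printf ("req %zu (fd=%d off=%lld len=%zu) -> batch %zu\n",
                        i, q[i].fd, (long long) q[i].offset, q[i].size,
                        batch);
        }

        return 0;
}

In the translator itself each batch becomes one STACK_WIND of writev with the member vectors concatenated (WB_IOV_LOAD) and their iobrefs merged; the sketch only shows which requests would end up together.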

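Editor's note: similarly, __wb_collapse_small_writes() in this patch merges small, offset-contiguous writes into a single page-sized buffer so that only one request needs to be wound later. Below is a hedged sketch of that size/contiguity condition only; the real code copies into an iobuf obtained from the iobuf pool and tracks it via an iobref, whereas this demo uses a plain stack buffer and memcpy.

/* Standalone sketch: collapse a contiguous follow-up write into the
 * "holder" buffer while the combined size still fits in one page. */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096  /* stand-in for this->ctx->page_size */

int
main (void)
{
        char        page[PAGE_SIZE];
        const char *first  = "hello ";  /* earlier buffered write         */
        const char *second = "world";   /* later write, contiguous offset */
        size_t      held   = 0;

        /* the holder request absorbs the first write ... */
        memcpy (page, first, strlen (first));
        held = strlen (first);

        /* ... and a contiguous follow-up is collapsed into the same
         * buffer as long as the combined size still fits in one page */
        if (held + strlen (second) <= sizeof (page)) {
                memcpy (page + held, second, strlen (second));
                held += strlen (second);
        }

        printf ("collapsed %zu bytes into one buffer: %.*s\n",
                held, (int) held, page);
        return 0;
}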